/* keyhash_create() * * The real creation function, which takes arguments for memory sizes. * This is abstracted to a static function because it's used by both * Create() and Clone() but slightly differently. * * Args: hashsize - size of hash table; this must be a power of two. * init_key_alloc - initial allocation for # of keys. * init_string_alloc - initial allocation for total size of key strings. * * Returns: An allocated hash table structure; or NULL on failure. */ ESL_KEYHASH * keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc) { ESL_KEYHASH *kh = NULL; int i; int status; ESL_ALLOC(kh, sizeof(ESL_KEYHASH)); kh->hashtable = NULL; kh->key_offset = NULL; kh->nxt = NULL; kh->smem = NULL; kh->hashsize = hashsize; kh->kalloc = init_key_alloc; kh->salloc = init_string_alloc; ESL_ALLOC(kh->hashtable, sizeof(int) * kh->hashsize); for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1; ESL_ALLOC(kh->key_offset, sizeof(int) * kh->kalloc); ESL_ALLOC(kh->nxt, sizeof(int) * kh->kalloc); for (i = 0; i < kh->kalloc; i++) kh->nxt[i] = -1; ESL_ALLOC(kh->smem, sizeof(char) * kh->salloc); kh->nkeys = 0; kh->sn = 0; return kh; ERROR: esl_keyhash_Destroy(kh); return NULL; }
/* Function: esl_dmatrix_Create() * * Purpose: Creates a general <n> x <m> matrix (<n> rows, <m> * columns). * * Args: <n> - number of rows; $>= 1$ * <m> - number of columns; $>= 1$ * * Returns: a pointer to a new <ESL_DMATRIX> object. Caller frees * with <esl_dmatrix_Destroy()>. * * Throws: <NULL> if an allocation failed. */ ESL_DMATRIX * esl_dmatrix_Create(int n, int m) { ESL_DMATRIX *A = NULL; int r; int status; ESL_ALLOC(A, sizeof(ESL_DMATRIX)); A->mx = NULL; A->n = n; A->m = m; ESL_ALLOC(A->mx, sizeof(double *) * n); A->mx[0] = NULL; ESL_ALLOC(A->mx[0], sizeof(double) * n * m); for (r = 1; r < n; r++) A->mx[r] = A->mx[0] + r*m; A->type = eslGENERAL; A->ncells = n * m; return A; ERROR: esl_dmatrix_Destroy(A); return NULL; }
/* Function: p7_coords2_hash_Create() * Synopsis: Create a <P7_COORDS2_HASH> * * Purpose: Allocate and initialize a <P7_COORDS2_HASH> hash table for storing * lots of coord2 arrays (i.e. domain annotations). * * The <init_*> arguments let you set non-default initial * allocation sizes. To use the default for any of these, * pass a 0 value. Defaults are 128 for the initial * hashtable size <init_hashsize>; 128 for the initial * allocation for number of keys to be stored <init_nkeyalloc>; * and 2048 for the initial allocation for the number * of integers to be stored in key data. * * In general the initialization defaults should be * fine. All three are grown automatically as needed, as * you add keys to the hash. * * "key data" means <n> <start>/<end> pairs, plus <n> * itself: it takes 2n+1 integers to store a <P7_COORD2> * array of length <n>. * * <hashsize> must be a power of 2; remember that if you * pass a non-default value. * * Args: init_hashsize : initial hashtable size. Power of 2; >0. * init_keyalloc : initial allocation for # keys. >0. * init_calloc : initial allocation for key data. >0. * * Returns: pointer to the new <P7_COORDS2_HASH> object on success. * * Throws: <NULL> on allocation failure. */ P7_COORDS2_HASH * p7_coords2_hash_Create(int32_t init_hashsize, int32_t init_nkeyalloc, int32_t init_calloc) { P7_COORDS2_HASH *ch = NULL; int32_t i; int status; ESL_DASSERT1(( init_hashsize == 0 || (init_hashsize && ((init_hashsize & (init_hashsize-1)) == 0)))); /* hashsize is a power of 2 (bitshifting trickery) */ ESL_ALLOC(ch, sizeof(P7_COORDS2_HASH)); ch->hashtable = NULL; ch->key_offset = NULL; ch->nxt = NULL; ch->cmem = NULL; ch->nkeys = 0; ch->cn = 0; ch->hashsize = (init_hashsize > 0 ? init_hashsize : 128); ch->kalloc = (init_nkeyalloc > 0 ? init_nkeyalloc : 128); ch->calloc = (init_calloc > 0 ? 
init_calloc : 2048); ESL_ALLOC(ch->hashtable, sizeof(int32_t) * ch->hashsize); for (i = 0; i < ch->hashsize; i++) ch->hashtable[i] = -1; ESL_ALLOC(ch->key_offset, sizeof(int32_t) * ch->kalloc); ESL_ALLOC(ch->nxt, sizeof(int32_t) * ch->kalloc); ESL_ALLOC(ch->cmem, sizeof(int32_t) * ch->calloc); return ch; ERROR: p7_coords2_hash_Destroy(ch); return NULL; }
P7_GBANDS * p7_gbands_Create(void) { P7_GBANDS *bnd = NULL; int init_segalloc = 4; int init_rowalloc = 64; int status; ESL_ALLOC(bnd, sizeof(P7_GBANDS)); bnd->nseg = 0; bnd->nrow = 0; bnd->L = 0; bnd->M = 0; bnd->ncell = 0; bnd->imem = NULL; bnd->kmem = NULL; ESL_ALLOC(bnd->imem, sizeof(int) * init_segalloc * 2); /* *2: for ia, ib pairs */ ESL_ALLOC(bnd->kmem, sizeof(int) * init_rowalloc * p7_GBANDS_NK); bnd->segalloc = init_segalloc; bnd->rowalloc = init_rowalloc; return bnd; ERROR: p7_gbands_Destroy(bnd); return NULL; }
ESL_HMX * esl_hmx_Create(int allocL, int allocM) { ESL_HMX *mx = NULL; int i; int status; ESL_ALLOC(mx, sizeof(ESL_HMX)); mx->dp_mem = NULL; mx->dp = NULL; mx->sc = NULL; ESL_ALLOC(mx->dp_mem, sizeof(float) * (allocL+1) * allocM); mx->ncells = (allocL+1) * allocM; ESL_ALLOC(mx->dp, sizeof (float *) * (allocL+1)); ESL_ALLOC(mx->sc, sizeof (float) * (allocL+2)); mx->allocR = allocL+1; for (i = 0; i <= allocL; i++) mx->dp[i] = mx->dp_mem + i*allocM; mx->validR = allocL+1; mx->allocM = allocM; mx->L = 0; mx->M = 0; return mx; ERROR: esl_hmx_Destroy(mx); return NULL; }
/* Function: esl_recorder_Create() * Synopsis: Create an <ESL_RECORDER>. * Incept: SRE, Fri Dec 25 16:27:40 2009 [Casa de Gatos] * * Purpose: Allocate a new <ESL_RECORDER> that will read * line-by-line from input stream <fp>, saving * a history of up to <maxlines> lines. * * Returns: pointer to the new <ESL_RECORDER> on success. * * Throws: <NULL> on allocation failure. */ ESL_RECORDER * esl_recorder_Create(FILE *fp, int maxlines) { ESL_RECORDER *rc = NULL; int i; int status; ESL_ALLOC(rc, sizeof(ESL_RECORDER)); rc->fp = fp; rc->line = NULL; rc->nalloc = maxlines; rc->lalloc = NULL; rc->offset = NULL; rc->nread = 0; rc->ncurr = 0; rc->baseline = 0; rc->markline = -1; ESL_ALLOC(rc->line, sizeof(char *) * rc->nalloc); for (i = 0; i < rc->nalloc; i++) rc->line[i] = NULL; ESL_ALLOC(rc->lalloc, sizeof(int) * rc->nalloc); for (i = 0; i < rc->nalloc; i++) rc->lalloc[i] = 0; ESL_ALLOC(rc->offset, sizeof(off_t) * rc->nalloc); for (i = 0; i < rc->nalloc; i++) rc->offset[i] = 0; return rc; ERROR: esl_recorder_Destroy(rc); return NULL; }
/* Function: p7_bg_Create() * Incept: SRE, Fri Jan 12 13:32:51 2007 [Janelia] * * Purpose: Allocate a <P7_BG> object for digital alphabet <abc>, * initializes it to appropriate default values, and * returns a pointer to it. * * For protein models, default iid background frequencies * are set (by <p7_AminoFrequencies()>) to average * SwissProt residue composition. For DNA, RNA and other * alphabets, default frequencies are set to a uniform * distribution. * * The model composition <bg->mcomp[]> is not initialized * here; neither is the filter null model <bg->fhmm>. To * use the filter null model, caller will want to * initialize these fields by calling * <p7_bg_SetFilterByHMM()>. * * Throws: <NULL> on allocation failure. * * Xref: STL11/125. */ P7_BG * p7_bg_Create(const ESL_ALPHABET *abc) { P7_BG *bg = NULL; int status; ESL_ALLOC(bg, sizeof(P7_BG)); bg->f = NULL; bg->fhmm = NULL; ESL_ALLOC(bg->f, sizeof(float) * abc->K); if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR; if (abc->type == eslAMINO) { if (p7_AminoFrequencies(bg->f) != eslOK) goto ERROR; } else esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K); bg->p1 = 350./351.; bg->omega = 1./256.; bg->abc = abc; return bg; ERROR: p7_bg_Destroy(bg); return NULL; }
/* Function: p7_oprofile_CreateBlock() * Synopsis: Create a new block of empty <P7_OM_BLOCK>. * Incept: * * Purpose: Creates a block of empty <P7_OM_BLOCK> profile objects. * * Returns: a pointer to the new <P7_OM_BLOCK>. Caller frees this * with <p7_oprofile_DestroyBlock()>. * * Throws: <NULL> if allocation fails. */ P7_OM_BLOCK * p7_oprofile_CreateBlock(int count) { int i = 0; P7_OM_BLOCK *block = NULL; int status = eslOK; ESL_ALLOC(block, sizeof(*block)); block->count = 0; block->listSize = 0; block->list = NULL; ESL_ALLOC(block->list, sizeof(P7_OPROFILE *) * count); block->listSize = count; for (i = 0; i < count; ++i) { block->list[i] = NULL; } return block; ERROR: if (block != NULL) { if (block->list != NULL) free(block->list); free(block); } return NULL; }
/* parse_replace_string()
 *
 * Split a "--replace" argument of the form "<s1>:<s2>", where <s1> and
 * <s2> have equal length, into two newly allocated NUL-terminated
 * strings.
 *
 * Args:     rstring  - the argument string to parse
 *           ret_from - RETURN: copy of <s1>; caller frees
 *           ret_to   - RETURN: copy of <s2>; caller frees
 *
 * Returns:  <eslOK> on success. If <rstring> is not of the required
 *           form, <esl_fatal()> is called with an explanatory message.
 *
 * Throws:   the status set by <ESL_ALLOC()> on allocation failure;
 *           nothing is returned through <ret_from>/<ret_to> then.
 */
static int
parse_replace_string(const char *rstring, char **ret_from, char **ret_to)
{
  int   status;
  int   rlen;
  int   half;
  char *from = NULL;
  char *to   = NULL;

  /* Note: we could use ESL_REGEXP but then multiple ':'s in rstring could cause problems */
  rlen = strlen(rstring);
  half = rlen / 2;

  /* Valid only if the length is odd and the exact middle character is ':',
   * which forces len(<s1>) == len(<s2>) == half.
   */
  if ((rlen % 2) != 0 && rstring[half] == ':')
    {
      ESL_ALLOC(from, sizeof(char) * (half+1));
      ESL_ALLOC(to,   sizeof(char) * (half+1));
      memcpy(from, rstring,            half);
      memcpy(to,   rstring + half + 1, half);
      from[half] = '\0';
      to[half]   = '\0';
    }
  else
    esl_fatal("--replace takes arg of <s1>:<s2> with len(<s1>) == len(<s2>); %s not recognized", rstring);

  *ret_from = from;
  *ret_to   = to;
  return eslOK;

 ERROR:
  if (from != NULL) free(from);
  if (to   != NULL) free(to);
  return status;
}
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Constructs a new alignment by removing near-identical
 *            sequences from a given alignment (where identity is
 *            calculated *based on the alignment*).
 *            Does not affect the given alignment.
 *            Keeps earlier sequence, discards later one.
 *
 *            Sequences are visited in order. A sequence is dropped if
 *            its pairwise identity to any already-kept sequence is
 *            greater than <maxid>; otherwise it is kept.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Args:      msa        - alignment to filter; not modified
 *            maxid      - identity threshold
 *            ret_newmsa - RETURN: the filtered alignment
 *
 * Return:    <eslOK> on success, and the <newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int    *kept  = NULL;     /* indices of seqs kept so far, [0..nkept-1] */
  int    *useme = NULL;     /* useme[i] is TRUE if seq i is kept         */
  int     nkept;            /* number of seqs kept so far                */
  double  pid;              /* pairwise identity                         */
  int     idx;              /* candidate sequence                        */
  int     j;                /* counter over kept sequences               */
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );

  ESL_ALLOC(kept,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0);

  nkept = 0;
  for (idx = 0; idx < msa->nseq; idx++)
    {
      /* Compare against every kept sequence; stop at the first one that is too similar. */
      for (j = 0; j < nkept; j++)
        {
          if (! (msa->flags & eslMSA_DIGITAL))
            {
              status = esl_dst_CPairId(msa->aseq[idx], msa->aseq[kept[j]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }
#ifdef eslAUGMENT_ALPHABET
          else
            {
              status = esl_dst_XPairId(msa->abc, msa->ax[idx], msa->ax[kept[j]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }
#endif
          if (pid > maxid) break;
        }

      /* The loop ran to completion only if nothing kept was too similar. */
      if (j == nkept)
        {
          kept[nkept++] = idx;
          useme[idx]    = TRUE;
        }
    }

  status = esl_msa_SequenceSubset(msa, useme, ret_newmsa);
  if (status != eslOK) goto ERROR;

  free(kept);
  free(useme);
  return eslOK;

 ERROR:
  if (kept  != NULL) free(kept);
  if (useme != NULL) free(useme);
  return status;
}
/* Function: esl_hyperexp_Create() * * Purpose: Creates an object to hold parameters for a <K>-component * hyperexponential. * * Parameters in the object are initialized * ($q_k = \frac{1}{K}$, $\lambda_k = 1$, $\mu = 0$), but * the caller will want to set these according to its own * purposes. * * Args: K - number of components in the mixture * * Returns: ptr to newly allocated/initialized <ESL_HYPEREXP> object. * * Throws: NULL on allocation failure. */ ESL_HYPEREXP * esl_hyperexp_Create(int K) { int status; ESL_HYPEREXP *h = NULL; int k; ESL_ALLOC(h, sizeof(ESL_HYPEREXP)); h->q = h->lambda = h->wrk = NULL; h->fixlambda = NULL; h->K = K; h->fixmix = FALSE; ESL_ALLOC(h->q, sizeof(double) * K); ESL_ALLOC(h->lambda, sizeof(double) * K); ESL_ALLOC(h->wrk, sizeof(double) * K); ESL_ALLOC(h->fixlambda, sizeof(char) * K); for (k = 0; k < K; k++) { h->q[k] = 1. / (double) K; h->lambda[k] = 1.; h->fixlambda[k]= 0; } h->mu = 0.; return h; ERROR: esl_hyperexp_Destroy(h); return NULL; }
/* Function: p7_tophits_Create() * Synopsis: Allocate a hit list. * * Purpose: Allocates a new <P7_TOPHITS> hit list, for an initial * allocation of <int_hit_alloc> hits (this will be grown * later as needed). Return a pointer to it. * * Args: init_hit_alloc - initial allocation size, # of hits. * Often p7_TOPHITS_DEFAULT_INIT_ALLOC. * * Throws: <NULL> on allocation failure. */ P7_TOPHITS * p7_tophits_Create(int init_hit_alloc) { P7_TOPHITS *h = NULL; int status; ESL_ALLOC(h, sizeof(P7_TOPHITS)); h->hit = NULL; h->unsrt = NULL; if (( h->unsrt = p7_hit_Create(init_hit_alloc) ) == NULL) goto ERROR; ESL_ALLOC(h->hit, sizeof(P7_HIT *) * init_hit_alloc); h->Nalloc = init_hit_alloc; h->N = 0; h->nreported = 0; h->nincluded = 0; h->is_sorted_by_sortkey = TRUE; /* but only because there's 0 hits */ h->is_sorted_by_seqidx = FALSE; h->hit[0] = h->unsrt; /* if you're going to call it "sorted" when it contains just one hit, you need this */ return h; ERROR: p7_tophits_Destroy(h); return NULL; }
/* Function: annotate_model() * * Purpose: Transfer rf, cs, and other optional annotation from the alignment * to the new model. * * Args: hmm - [M] new model to annotate * matassign - which alignment columns are MAT; [1..alen] * msa - alignment, including annotation to transfer * * Return: <eslOK> on success. * * Throws: <eslEMEM> on allocation error. */ static int annotate_model(P7_HMM *hmm, int *matassign, ESL_MSA *msa) { int apos; /* position in matassign, 1.alen */ int k; /* position in model, 1.M */ int status; /* Reference coord annotation */ if (msa->rf != NULL) { ESL_ALLOC(hmm->rf, sizeof(char) * (hmm->M+2)); hmm->rf[0] = ' '; for (apos = k = 1; apos <= msa->alen; apos++) if (matassign[apos]) hmm->rf[k++] = msa->rf[apos-1]; /* watch off-by-one in msa's rf */ hmm->rf[k] = '\0'; hmm->flags |= p7H_RF; } /* Model mask annotation */ if (msa->mm != NULL) { ESL_ALLOC(hmm->mm, sizeof(char) * (hmm->M+2)); hmm->mm[0] = ' '; for (apos = k = 1; apos <= msa->alen; apos++) if (matassign[apos]) hmm->mm[k++] = msa->mm[apos-1]; hmm->mm[k] = '\0'; hmm->flags |= p7H_MMASK; } /* Consensus structure annotation */ if (msa->ss_cons != NULL) { ESL_ALLOC(hmm->cs, sizeof(char) * (hmm->M+2)); hmm->cs[0] = ' '; for (apos = k = 1; apos <= msa->alen; apos++) if (matassign[apos]) hmm->cs[k++] = msa->ss_cons[apos-1]; hmm->cs[k] = '\0'; hmm->flags |= p7H_CS; } /* Surface accessibility annotation */ if (msa->sa_cons != NULL) { ESL_ALLOC(hmm->ca, sizeof(char) * (hmm->M+2)); hmm->ca[0] = ' '; for (apos = k = 1; apos <= msa->alen; apos++) if (matassign[apos]) hmm->ca[k++] = msa->sa_cons[apos-1]; hmm->ca[k] = '\0'; hmm->flags |= p7H_CA; } /* The alignment map (1..M in model, 1..alen in alignment) */ ESL_ALLOC(hmm->map, sizeof(int) * (hmm->M+1)); hmm->map[0] = 0; for (apos = k = 1; apos <= msa->alen; apos++) if (matassign[apos]) hmm->map[k++] = apos; hmm->flags |= p7H_MAP; return eslOK; ERROR: return status; }
/* Function:  cp9_Copy()
 * Synopsis:  Copy a CM plan 9 HMM.
 *
 * Purpose:   Copies cp9 hmm <src> to cp9 hmm <dst>, where <dst>
 *            has already been allocated for the same number of
 *            nodes <M> as <src>.
 *
 *            <src> should be properly normalized, no check is done to
 *            ensure that. If <src> is logoddsified (src->flags &
 *            CPLAN9_HASBITS) its bit scores will be copied to <dst>,
 *            otherwise they are invalid and won't be copied.
 *
 *            The per-node <el_from_idx[k]> and <el_from_cmnd[k]>
 *            arrays are newly allocated in <dst> for every node with
 *            a nonzero <el_from_ct[k]>.
 *            NOTE(review): any arrays <dst> already holds in those
 *            slots are overwritten without being freed here - confirm
 *            callers pass a <dst> without them.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; <eslEINVAL> if <dst->M>
 *            differs from <src->M>.
 */
int
cp9_Copy(const CP9_t *src, CP9_t *dst)
{
  int status;
  int node;
  int nfrom;

  if (src->M != dst->M) return eslEINVAL;

  dst->abc = src->abc;

  /* Probability parameters, node by node */
  for (node = 0; node <= src->M; node++)
    {
      esl_vec_FCopy(src->t[node],   cp9_NTRANS,  dst->t[node]);
      esl_vec_FCopy(src->mat[node], src->abc->K, dst->mat[node]);
      esl_vec_FCopy(src->ins[node], src->abc->K, dst->ins[node]);
    }
  esl_vec_FCopy(src->begin, src->M+1, dst->begin);
  esl_vec_FCopy(src->end,   src->M+1, dst->end);

  /* Bit scores, only when <src> has valid ones */
  if (src->flags & CPLAN9_HASBITS)
    {
      esl_vec_ICopy(src->bsc_mem, src->M+1, dst->bsc_mem);
      esl_vec_ICopy(src->esc_mem, src->M+1, dst->esc_mem);

      /* exploit linear-memory of these 2d arrays */
      esl_vec_ICopy(src->tsc_mem, cp9_NTRANS   * (src->M+1), dst->tsc_mem);
      esl_vec_ICopy(src->msc_mem, src->abc->Kp * (src->M+1), dst->msc_mem);
      esl_vec_ICopy(src->isc_mem, src->abc->Kp * (src->M+1), dst->isc_mem);
      esl_vec_ICopy(src->otsc,    cp9O_NTRANS  * (src->M+1), dst->otsc);
    }

  /* EL info */
  dst->el_self   = src->el_self;
  dst->el_selfsc = src->el_selfsc;
  esl_vec_ICopy(src->has_el,     src->M+1, dst->has_el);
  esl_vec_ICopy(src->el_from_ct, src->M+2, dst->el_from_ct);
  for (node = 0; node <= src->M+1; node++)
    {
      nfrom = src->el_from_ct[node];
      if (nfrom <= 0) continue;

      ESL_ALLOC(dst->el_from_idx[node],  sizeof(int) * nfrom);
      ESL_ALLOC(dst->el_from_cmnd[node], sizeof(int) * nfrom);
      esl_vec_ICopy(src->el_from_idx[node],  nfrom, dst->el_from_idx[node]);
      esl_vec_ICopy(src->el_from_cmnd[node], nfrom, dst->el_from_cmnd[node]);
    }

  /* Null models and remaining scalars */
  dst->null2_omega = src->null2_omega;
  dst->null3_omega = src->null3_omega;
  esl_vec_FCopy(src->null, src->abc->K, dst->null);

  dst->p1    = src->p1;
  dst->flags = src->flags;
  return eslOK;

 ERROR:
  return status;
}
/* Function: esl_msacluster_SingleLinkage() * Synopsis: Single linkage clustering by percent identity. * Incept: SRE, Sun Nov 5 10:11:45 2006 [Janelia] * * Purpose: Perform single link clustering of the sequences in * multiple alignment <msa>. Any pair of sequences with * percent identity $\geq$ <maxid> are linked (using * the definition from the \eslmod{distance} module). * * The resulting clustering is optionally returned in one * or more of <opt_c>, <opt_nin>, and <opt_nc>. The * <opt_c[0..nseq-1]> array assigns a cluster index * <(0..nc-1)> to each sequence. For example, <c[4] = 1> * means that sequence 4 is assigned to cluster 1. The * <opt_nin[0..nc-1]> array is the number of sequences * in each cluster. <opt_nc> is the number of clusters. * * Importantly, this algorithm runs in $O(N)$ memory, and * produces one discrete clustering. Compare to * <esl_tree_SingleLinkage()>, which requires an $O(N^2)$ * adjacency matrix, and produces a hierarchical clustering * tree. * * The algorithm is worst case $O(LN^2)$ time, for $N$ * sequences of length $L$. However, the worst case is no * links at all, and this is unusual. More typically, time * scales as about $LN \log N$. The best case scales as * $LN$, when there is just one cluster in a completely * connected graph. * * Args: msa - multiple alignment to cluster * maxid - pairwise identity threshold: cluster if $\geq$ <maxid> * opt_c - optRETURN: cluster assignments for each sequence, [0..nseq-1] * opt_nin - optRETURN: number of seqs in each cluster, [0..nc-1] * opt_nc - optRETURN: number of clusters * * Returns: <eslOK> on success; the <opt_c[0..nseq-1]> array contains * cluster indices <0..nc-1> assigned to each sequence; the * <opt_nin[0..nc-1]> array contains the number of seqs in * each cluster; and <opt_nc> contains the number of * clusters. The <opt_c> array and <opt_nin> arrays will be * allocated here, if non-<NULL>, and must be free'd by the * caller. The input <msa> is unmodified. 
* * The caller may pass <NULL> for either <opt_c> or * <opt_nc> if it is only interested in one of the two * results. * * Throws: <eslEMEM> on allocation failure, and <eslEINVAL> if a pairwise * comparison is invalid (which means the MSA is corrupted, so it * shouldn't happen). In either case, <opt_c> and <opt_nin> are set to <NULL> * and <opt_nc> is set to 0, and the <msa> is unmodified. */ int esl_msacluster_SingleLinkage(const ESL_MSA *msa, double maxid, int **opt_c, int **opt_nin, int *opt_nc) { int status; int *workspace = NULL; int *assignment = NULL; int *nin = NULL; int nc; int i; #ifdef eslAUGMENT_ALPHABET struct msa_param_s param; #endif /* Allocations */ ESL_ALLOC(workspace, sizeof(int) * msa->nseq * 2); ESL_ALLOC(assignment, sizeof(int) * msa->nseq); /* call to SLC API: */ if (! (msa->flags & eslMSA_DIGITAL)) status = esl_cluster_SingleLinkage((void *) msa->aseq, (size_t) msa->nseq, sizeof(char *), msacluster_clinkage, (void *) &maxid, workspace, assignment, &nc); #ifdef eslAUGMENT_ALPHABET else { param.maxid = maxid; param.abc = msa->abc; status = esl_cluster_SingleLinkage((void *) msa->ax, (size_t) msa->nseq, sizeof(ESL_DSQ *), msacluster_xlinkage, (void *) ¶m, workspace, assignment, &nc); } #endif if (opt_nin != NULL) { ESL_ALLOC(nin, sizeof(int) * nc); for (i = 0; i < nc; i++) nin[i] = 0; for (i = 0; i < msa->nseq; i++) nin[assignment[i]]++; *opt_nin = nin; } /* cleanup and return */ free(workspace); if (opt_c != NULL) *opt_c = assignment; else free(assignment); if (opt_nc != NULL) *opt_nc = nc; return eslOK; ERROR: if (workspace != NULL) free(workspace); if (assignment != NULL) free(assignment); if (nin != NULL) free(nin); if (opt_c != NULL) *opt_c = NULL; if (opt_nc != NULL) *opt_nc = 0; return status; }
/* Function:  esl_hxp_FitCompleteBinned()
 *
 * Purpose:   Given a histogram <g> with binned observations, where each
 *            bin i holds some number of observed samples x with values from
 *            lower bound l to upper bound u (that is, $l < x \leq u$),
 *            and given a starting guess <h> for hyperexponential parameters;
 *
 *            Find maximum likelihood parameters <h> by conjugate gradient
 *            descent, starting from the initial <h> and leaving the
 *            optimized solution in <h>, with its components sorted by
 *            <esl_hyperexp_SortComponents()>.
 *
 *            The free parameters are the K-1 mixture coefficients
 *            (unless <h->fixmix> is set) plus each lambda that is not
 *            marked in <h->fixlambda>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error, and <h> is left in its
 *            initial state. A non-<eslOK> status from the optimizer is
 *            passed back to the caller; <h> is not updated from the
 *            parameter vector in that case.
 */
int
esl_hxp_FitCompleteBinned(ESL_HISTOGRAM *g, ESL_HYPEREXP *h)
{
  struct hyperexp_binned_data data;
  double *paramv  = NULL;       /* packed free parameters      */
  double *stepv   = NULL;       /* optimizer step size vector  */
  double *scratch = NULL;       /* optimizer workspace, 4*np   */
  double  tol     = 1e-6;
  double  fx;
  int     nparam;
  int     k;
  int     status;

  /* Count free parameters: K-1 mix coefficients, and up to K lambdas. */
  nparam = (h->fixmix ? 0 : h->K-1);
  for (k = 0; k < h->K; k++)
    if (! h->fixlambda[k]) nparam++;

  ESL_ALLOC(paramv,  sizeof(double) * nparam);
  ESL_ALLOC(stepv,   sizeof(double) * nparam);
  ESL_ALLOC(scratch, sizeof(double) * nparam * 4);

  /* Shared info that the objective and gradient callbacks receive. */
  data.g = g;
  data.h = h;

  /* Starting point comes from <h>; every step size is 1.0. */
  hyperexp_pack_paramvector(paramv, nparam, h);
  for (k = 0; k < nparam; k++)
    stepv[k] = 1.0;

  /* Feed it all to the mighty optimizer. */
  status = esl_min_ConjugateGradientDescent(paramv, stepv, nparam,
                                            &hyperexp_complete_binned_func,
                                            &hyperexp_complete_binned_gradient,
                                            (void *) (&data), tol, scratch, &fx);
  if (status != eslOK) goto ERROR;

  /* Convert the final parameter vector back to a hyperexponential */
  hyperexp_unpack_paramvector(paramv, nparam, h);

  free(paramv);
  free(stepv);
  free(scratch);
  esl_hyperexp_SortComponents(h);
  return eslOK;

 ERROR:
  if (paramv  != NULL) free(paramv);
  if (stepv   != NULL) free(stepv);
  if (scratch != NULL) free(scratch);
  return status;
}
/* mpi_worker()
 * The main control for an MPI worker process.
 *
 * Repeatedly receives a model from rank 0, optionally recalibrates it
 * (--recal), runs the work unit on it, and sends back a packed
 * message: the status, then <cfg->N> scores, then (with -a) <cfg->N>
 * alignment lengths. The loop ends when the receive returns anything
 * other than <eslOK>.
 *
 * If recalibration or the work unit fails, the worker instead sends
 * the failing status followed by the error message buffer, cleans up,
 * and returns. An allocation failure during setup is fatal via
 * <p7_Fail()>.
 */
static void
mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  int      xstatus = eslOK;
  int      status;
  P7_HMM  *hmm     = NULL;
  char    *packbuf = NULL;      /* send/receive buffer                            */
  int      packlen = 0;         /* its allocated size                             */
  double  *scores  = NULL;      /* result: array of N scores                      */
  int     *alilens = NULL;      /* optional result: array of N alignment lengths  */
  char     errbuf[eslERRBUFSIZE];
  int      pos;

  /* Worker initializes.
   * NOTE(review): a failure sizing the buffer is only recorded in <xstatus>,
   * which nothing below reads - confirm whether it should be acted on.
   */
  status = minimum_mpi_working_buffer(go, cfg->N, &packlen);
  if (status != eslOK) xstatus = status;

  ESL_ALLOC(packbuf, packlen * sizeof(char));
  ESL_ALLOC(scores,  cfg->N * sizeof(double) + 2);
  if (esl_opt_GetBoolean(go, "-a"))
    {
      ESL_ALLOC(alilens, cfg->N * sizeof(int));
    }

  /* Main worker loop */
  for (;;)
    {
      if (p7_hmm_mpi_Recv(0, 0, MPI_COMM_WORLD, &packbuf, &packlen, &(cfg->abc), &hmm) != eslOK) break;

      if (esl_opt_GetBoolean(go, "--recal"))
        {
          status = recalibrate_model(go, cfg, errbuf, hmm);
          if (status != eslOK) goto CLEANERROR;
        }

      status = process_workunit(go, cfg, errbuf, hmm, scores, alilens);
      if (status != eslOK) goto CLEANERROR;

      /* Result message: status, scores, then alignment lengths if requested. */
      pos = 0;
      MPI_Pack(&status, 1,      MPI_INT,    packbuf, packlen, &pos, MPI_COMM_WORLD);
      MPI_Pack(scores,  cfg->N, MPI_DOUBLE, packbuf, packlen, &pos, MPI_COMM_WORLD);
      if (esl_opt_GetBoolean(go, "-a"))
        MPI_Pack(alilens, cfg->N, MPI_INT,  packbuf, packlen, &pos, MPI_COMM_WORLD);
      MPI_Send(packbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);

      p7_hmm_Destroy(hmm);
    }

  free(packbuf);
  free(scores);
  if (alilens != NULL) free(alilens);
  return;

 CLEANERROR:
  /* Error message: the failing status, then the whole error buffer. */
  pos = 0;
  MPI_Pack(&status, 1,             MPI_INT,  packbuf, packlen, &pos, MPI_COMM_WORLD);
  MPI_Pack(errbuf,  eslERRBUFSIZE, MPI_CHAR, packbuf, packlen, &pos, MPI_COMM_WORLD);
  MPI_Send(packbuf, pos, MPI_PACKED, 0, 0, MPI_COMM_WORLD);

  if (packbuf != NULL) free(packbuf);
  if (hmm     != NULL) p7_hmm_Destroy(hmm);
  if (scores  != NULL) free(scores);
  if (alilens != NULL) free(alilens);
  return;

 ERROR:
  p7_Fail("Allocation error in mpi_worker");
}
/* Function: CreateCP9Matrix() * based on CreatePlan7Matrix() <-- this function's comments below * Purpose: Create a dynamic programming matrix for standard Forward, * Backward, or Viterbi, with scores kept as scaled log-odds * integers. Keeps 2D arrays compact in RAM in an attempt * to maximize cache hits. * * The mx structure can be dynamically grown, if a new * HMM or seq exceeds the currently allocated size. Dynamic * growing is more efficient than an alloc/free of a whole * matrix for every new target. The ResizePlan7Matrix() * call does this reallocation, if needed. Here, in the * creation step, we set up some pads - to inform the resizing * call how much to overallocate when it realloc's. * * Args: N - N+1 rows are allocated, usually N == 1 for * scanning in memory efficient mode, or N == L, length of sequence. * M - size of model in nodes * * Return: mx * mx is allocated here. Caller frees with FreeCP9Matrix(mx). */ CP9_MX * CreateCP9Matrix(int N, int M) { int status; CP9_MX *mx; int i; ESL_ALLOC(mx, sizeof(CP9_MX)); ESL_ALLOC(mx->mmx, sizeof(int *) * (N+1)); ESL_ALLOC(mx->imx, sizeof(int *) * (N+1)); ESL_ALLOC(mx->dmx, sizeof(int *) * (N+1)); ESL_ALLOC(mx->elmx,sizeof(int *) * (N+1)); /* slightly wasteful, some nodes can't go to EL (for ex: right half of MATPs) */ ESL_ALLOC(mx->erow, sizeof(int) * (N+1)); ESL_ALLOC(mx->mmx_mem, sizeof(int) * ((N+1)*(M+1))); ESL_ALLOC(mx->imx_mem, sizeof(int) * ((N+1)*(M+1))); ESL_ALLOC(mx->dmx_mem, sizeof(int) * ((N+1)*(M+1))); ESL_ALLOC(mx->elmx_mem,sizeof(int) * ((N+1)*(M+1))); /* The indirect assignment below looks wasteful; it's actually * used for aligning data on 16-byte boundaries as a cache * optimization in the fast altivec implementation */ mx->mmx[0] = (int *) mx->mmx_mem; mx->imx[0] = (int *) mx->imx_mem; mx->dmx[0] = (int *) mx->dmx_mem; mx->elmx[0]= (int *) mx->elmx_mem; for (i = 1; i <= N; i++) { mx->mmx[i] = mx->mmx[0] + (i*(M+1)); mx->imx[i] = mx->imx[0] + (i*(M+1)); mx->dmx[i] = mx->dmx[0] + (i*(M+1)); 
mx->elmx[i]= mx->elmx[0]+ (i*(M+1)); } mx->M = M; mx->rows = N; mx->kmin = NULL; mx->kmax = NULL; mx->ncells_allocated = (M+1) * (N+1); mx->ncells_valid = (M+1) * (N+1); mx->size_Mb = (float) sizeof(CP9_MX); mx->size_Mb += (float) (sizeof(int *) * (mx->rows+1) * 4); /* mx->*mx ptrs */ mx->size_Mb += (float) (sizeof(int) * (mx->rows+1) * (M+1) * 4); /* mx->*mx_mem */ mx->size_Mb += (float) (sizeof(int) * (mx->rows+1)); /* mx->erow */ mx->size_Mb /= 1000000.; return mx; ERROR: cm_Fail("Memory allocation error."); return NULL; /* never reached */ }
/* map_sub_msas
 *
 * msa1 and msa2 contain the same named sequences, msa1 contains a superset
 * of the columns in msa2. Determine which of the msa1 columns the msa2
 * columns correspond to.
 *
 * Columns are matched greedily, left to right: the current msa2 column
 * is assigned to the first remaining msa1 column whose residues are
 * identical in every sequence.
 *
 * Args:     go     - command line options; used only for file names in error messages
 *           errbuf - buffer for an error message; may be NULL
 *           msa1   - the larger alignment; must be digital
 *           msa2   - the sub-alignment; must be digital and shorter than msa1
 *           ret_msa1_to_msa2_mask - RETURN: string of length msa1->alen, with
 *                    '1' at each msa1 column (0-based) that maps to an msa2
 *                    column and '0' elsewhere. Caller frees.
 *
 * Returns:  <eslOK> on success.
 *
 *           <eslEINVAL>, with a message in <errbuf>, if either alignment
 *           is not digital, if msa1 is not longer than msa2, if the two
 *           do not hold the same named sequences in the same order, or if
 *           some msa2 column cannot be mapped. Nothing is returned through
 *           <ret_msa1_to_msa2_mask> on failure, and nothing allocated here
 *           is left behind.
 *
 * Throws:   the status set by <ESL_ALLOC()> on allocation failure.
 */
static int
map_sub_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, char **ret_msa1_to_msa2_mask)
{
  int    status;
  int    apos1, apos2;          /* counters over alignment position in msa1, msa2 respectively */
  int    i;
  char  *mask = NULL;

  /* contract checks; all of them come before the allocation, so their early returns cannot leak */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa2 (%s) not digitized.\n", esl_opt_GetString(go, "--submap"));
  if(msa1->alen <= msa2->alen) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() alignment length for msa1 (%" PRId64 ") <= length for msa2 (%" PRId64 ")\n", msa1->alen, msa2->alen);

  /* both alignments must have same 'named' sequences in same order */
  if(msa1->nseq != msa2->nseq) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 has %d sequences, msa2 has %d sequences\n", msa1->nseq, msa2->nseq);
  for(i = 0; i < msa1->nseq; i++) {
    if(strcmp(msa1->sqname[i], msa2->sqname[i]) != 0) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 seq %d is named %s, msa2 seq %d is named %s\n", i, msa1->sqname[i], i, msa2->sqname[i]);
  }

  ESL_ALLOC(mask, sizeof(char) * (msa1->alen+1));
  for(apos1 = 0; apos1 < msa1->alen; apos1++) mask[apos1] = '0';
  mask[msa1->alen] = '\0';

  /* Walk both alignments in step. The loop stops as soon as either one is
   * exhausted, so neither column index ever goes past its alignment's end.
   */
  apos1 = 1;
  apos2 = 1;
  while((apos1 <= msa1->alen) && (apos2 <= msa2->alen)) {
    /* does msa1 column apos1 match msa2 column apos2 in every sequence? */
    for(i = 0; i < msa1->nseq; i++) {
      if(msa1->ax[i][apos1] != msa2->ax[i][apos2]) break;
    }
    if(i == msa1->nseq) { /* found a match */
      mask[(apos1-1)] = '1';
      apos2++;
    }
    apos1++; /* matched or not, this msa1 column is done */
  }

  /* Success means every msa2 column was placed; msa1 columns left over
   * after the last match simply stay '0'. Otherwise msa1 ran out first.
   */
  if(apos2 != (msa2->alen+1)) {
    free(mask);
    ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas(), failure mapping alignments, end of loop apos1-1 = %d (msa1->alen: %" PRId64 ") and apos2-1 = %d (msa2->alen: %" PRId64 ")\n", apos1-1, msa1->alen, apos2-1, msa2->alen);
  }

  *ret_msa1_to_msa2_mask = mask;
  return eslOK;

 ERROR:
  return status;
}
/* a2m_padding_digital()
 *
 * Rewrite every digital row of <msa> to a common length by padding the
 * insert run around each consensus column with gap symbols.
 *
 * The new alignment length is <ncons> plus the sum of nins[0..ncons].
 * <msa->rf> is allocated here (any previous pointer is overwritten, not
 * freed) and filled with '.' for insert columns and 'x' for consensus
 * columns. Each <msa->ax[idx]> is replaced by a freshly allocated row
 * with sentinels at positions 0 and alen+1; the old row is freed.
 * <msa->alen> is updated last.
 *
 * Args:    msa    - alignment to rewrite in place
 *          csflag - csflag[idx][spos] == FALSE marks 0-based position
 *                   <spos> of sequence <idx> as an insert residue; any
 *                   other value ends the current insert run.
 *                   NOTE(review): the trailing insert run only stops on
 *                   a non-FALSE entry, so each csflag[idx] presumably
 *                   carries a terminating flag -- confirm with caller.
 *          nins   - nins[cpos], cpos=0..ncons: number of insert columns
 *                   emitted before consensus column <cpos> (nins[ncons]
 *                   follows the last consensus column)
 *          ncons  - number of consensus columns
 *
 * Returns: eslOK on success.
 *
 * Throws:  the status set by ESL_ALLOC() on allocation failure; rows
 *          already swapped in stay swapped and <msa->alen> is not
 *          updated in that case.
 */
static int
a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  ESL_DSQ *row = NULL;                        /* row under construction; ownership moves to msa->ax[] */
  ESL_DSQ  gap = esl_abc_XGetGap(msa->abc);
  int      a;                                 /* columns written so far in the new row, 0..alen */
  int      c;                                 /* consensus column index, 0..ncons               */
  int      s;                                 /* residues consumed so far from the old row      */
  int      used;                              /* insert columns filled for the current <c>      */
  int      seqi;
  int      alen;
  int      status;

  /* total width = consensus columns + every insert run */
  alen = ncons;
  for (c = 0; c <= ncons; c++)
    alen += nins[c];

  /* reference line: '.' over inserts, 'x' over consensus */
  ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
  a = 0;
  for (c = 0; c <= ncons; c++)
    {
      for (used = 0; used < nins[c]; used++)
        msa->rf[a++] = '.';
      if (c < ncons)
        msa->rf[a++] = 'x';
    }
  msa->rf[a] = '\0';

  for (seqi = 0; seqi < msa->nseq; seqi++)
    {
      ESL_ALLOC(row, sizeof(ESL_DSQ) * (alen + 2));
      row[0] = eslDSQ_SENTINEL;

      a = 0;
      s = 0;
      for (c = 0; c <= ncons; c++)
        {
          /* copy this sequence's own insert residues */
          for (used = 0; csflag[seqi][s] == FALSE; used++)
            {
              row[a+1] = msa->ax[seqi][s+1];
              a++;
              s++;
            }

          /* pad the insert run out to its full width */
          for (; used < nins[c]; used++)
            {
              row[a+1] = gap;
              a++;
            }

          /* then the consensus residue, except after the last run */
          if (c < ncons)
            {
              row[a+1] = msa->ax[seqi][s+1];
              a++;
              s++;
            }
        }
      ESL_DASSERT1( (msa->ax[seqi][s+1] == eslDSQ_SENTINEL) );
      ESL_DASSERT1( (a == alen) );
      row[alen+1] = eslDSQ_SENTINEL;

      free(msa->ax[seqi]);
      msa->ax[seqi] = row;
      row = NULL;               /* now owned by msa */
    }

  msa->alen = alen;
  return eslOK;

 ERROR:
  if (row) free(row);
  return status;
}
/* Function:  p7_ViterbiMu()
 * Synopsis:  Determines the local Viterbi Gumbel mu parameter for a model.
 * Incept:    SRE, Tue May 19 10:26:19 2009 [Janelia]
 *
 * Purpose:   Viterbi-score counterpart of p7_MSVMu(): simulate <N>
 *            sequences of length <L> from the background frequencies
 *            <bg->f>, score each one with p7_ViterbiFilter(), subtract
 *            the null score from p7_bg_NullOne(), divide by
 *            <eslCONST_LOG2>, and fit the location parameter of a
 *            Gumbel with known <lambda> to those values.
 *
 *            The difference between the MSV and Viterbi mus is small,
 *            but can be up to ~1 bit or so for large, low-info models
 *            [J4/126], so the two are calibrated separately [J5/8].
 *
 *            Unless <p7_IMPL_DUMMY> is defined, a filter return of
 *            <eslERANGE> is not an error: the score is replaced by
 *            (32767.0 - om->base_w) / om->scale_w [J4/139].
 *
 * Args:      r       : source of random numbers
 *            om      : score profile (length config is changed upon return!)
 *            bg      : null model    (length config is changed upon return!)
 *            L       : length of sequences to simulate
 *            N       : number of sequences to simulate
 *            lambda  : known Gumbel lambda parameter
 *            ret_vmu : RETURN: ML estimate of location param mu
 *
 * Returns:   <eslOK> on success, and <*ret_vmu> contains the ML estimate
 *            of $\mu$.
 *
 * Throws:    <eslEMEM> if the DP matrix cannot be created; otherwise
 *            the status of the first allocation or callee that fails.
 *            In all failure cases <*ret_vmu> is set to 0.0.
 */
int
p7_ViterbiMu(ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double *ret_vmu)
{
  P7_OMX  *omx    = p7_omx_Create(om->M, 0, 0);   /* DP matrix: 1 row version */
  ESL_DSQ *seq    = NULL;                         /* one simulated sequence, reused each round */
  double  *scores = NULL;                         /* the N values handed to the Gumbel fit     */
  float    vsc;
  float    null1;
  int      n;
  int      status;
#ifndef p7_IMPL_DUMMY
  float    sc_limit = (32767.0 - om->base_w) / om->scale_w; /* if score overflows, use this [J4/139] */
#endif

  if (omx == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(scores, sizeof(double)  * N);
  ESL_ALLOC(seq,    sizeof(ESL_DSQ) * (L+2));

  p7_oprofile_ReconfigLength(om, L);
  p7_bg_SetLength(bg, L);

  for (n = 0; n < N; n++)
    {
      status = esl_rsq_xfIID(r, bg->f, om->abc->K, L, seq);
      if (status != eslOK) goto ERROR;

      status = p7_bg_NullOne(bg, seq, L, &null1);
      if (status != eslOK) goto ERROR;

      status = p7_ViterbiFilter(seq, L, om, omx, &vsc);
#ifndef p7_IMPL_DUMMY
      if (status == eslERANGE)
        {
          vsc    = sc_limit;
          status = eslOK;
        }
#endif
      if (status != eslOK) goto ERROR;

      scores[n] = (vsc - null1) / eslCONST_LOG2;
    }

  status = esl_gumbel_FitCompleteLoc(scores, N, lambda, ret_vmu);
  if (status != eslOK) goto ERROR;

  p7_omx_Destroy(omx);
  free(scores);
  free(seq);
  return eslOK;

 ERROR:
  *ret_vmu = 0.0;
  if (omx != NULL) p7_omx_Destroy(omx);
  free(scores);
  free(seq);
  return status;
}
/* Step 1. Label all sequence fragments < fragfrac of average raw length
 *
 * The threshold is cfg->fragfrac times the mean of esl_abc_dsqrlen()
 * over all sequences in <msa>. Every sequence whose esl_abc_dsqrlen()
 * is below the threshold is excluded; the remaining ones are passed to
 * esl_msa_SequenceSubset() to build <*ret_filteredmsa>.
 *
 * Args:    cfg             - supplies <fragfrac>
 *          msa             - digital alignment to filter (not modified here)
 *          ret_filteredmsa - RETURN: alignment built by esl_msa_SequenceSubset()
 *          ret_nfrags      - RETURN: number of sequences excluded
 *
 * Returns: eslOK on success.
 *          On failure, returns the status from the failed allocation
 *          or from esl_msa_SequenceSubset(), with <*ret_filteredmsa>
 *          set to NULL; <*ret_nfrags> is not written in that case.
 */
static int
remove_fragments(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_filteredmsa, int *ret_nfrags)
{
  int    *keep   = NULL;     /* keep[i] = 1 to retain sequence i, 0 to drop it */
  double  cutoff = 0.0;      /* first the length total, then the threshold     */
  int     nfrags = 0;
  int     idx;
  int     status;

  for (idx = 0; idx < msa->nseq; idx++)
    cutoff += esl_abc_dsqrlen(msa->abc, msa->ax[idx]);
  cutoff *= cfg->fragfrac / (double) msa->nseq;

  ESL_ALLOC(keep, sizeof(int) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++)
    {
      if (esl_abc_dsqrlen(msa->abc, msa->ax[idx]) < cutoff)
        {
          keep[idx] = 0;
          nfrags++;
        }
      else
        {
          keep[idx] = 1;
        }
    }

  status = esl_msa_SequenceSubset(msa, keep, ret_filteredmsa);
  if (status != eslOK) goto ERROR;

  *ret_nfrags = nfrags;
  free(keep);
  return eslOK;

 ERROR:
  free(keep);
  *ret_filteredmsa = NULL;
  return status;
}
/* Function:  p7_checkptmx_DumpFBRow()
 * Synopsis:  Dump one row from fwd or bck version of the matrix.
 *
 * Purpose:   Print row <dpc> of a forward or backward calculation in DP
 *            matrix <ox> to the stream <ox->dfp>, for diagnostics.
 *            <rowi> is printed as the row label, preceded by the
 *            free-text label <pfx>. (The checkpointed backward
 *            implementation interleaves backward rows with
 *            recalculated forward rows and dumps both, so they need a
 *            label such as "fwd" or "bck" to tell them apart.)
 *
 *            Three lines are written, tagged M, I and D, followed by a
 *            blank line. Each line lists positions 0..M, where
 *            position 0 is always printed as the value 0. The M line
 *            additionally ends with the <p7C_NXCELLS> values stored
 *            after the <Qf>*<p7C_NSCELLS> vectors of <dpc>. If
 *            <p7_SHOW_LOG> is set in <ox->dump_flags>, each value is
 *            passed through esl_logf() before printing.
 *
 * Args:      ox   - matrix; supplies Qf, M, and the dump_* settings
 *            rowi - row index, used only as a label
 *            dpc  - the striped row to dump
 *            pfx  - label printed ahead of <rowi>
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    the status set by ESL_ALLOC() if the scratch row cannot
 *            be allocated.
 */
int
p7_checkptmx_DumpFBRow(P7_CHECKPTMX *ox, int rowi, __m128 *dpc, char *pfx)
{
  union { __m128 v; float x[p7_VNF]; } lanes;   /* one vector, viewed lane by lane */
  float  *row       = NULL;                     /* unstriped cells, indexed 0..Q*p7_VNF */
  int     Q         = ox->Qf;
  int     M         = ox->M;
  float  *specials  = (float *) (dpc + Q*p7C_NSCELLS);
  int     logify    = FALSE;
  int     maxpfx    = ox->dump_maxpfx;
  int     width     = ox->dump_width;
  int     precision = ox->dump_precision;
  double  cell;
  int     k, q, z;
  int     status;

  if (ox->dump_flags & p7_SHOW_LOG) logify = TRUE;

  ESL_ALLOC(row, sizeof(float) * ( (Q*p7_VNF) + 1));
  row[0] = 0.;

  /* Line 1. M cells: unpack, unstripe, print */
  for (q = 0; q < Q; q++)
    {
      lanes.v = P7C_MQ(dpc, q);
      for (z = 0; z < p7_VNF; z++)
        row[(Q*z + q) + 1] = lanes.x[z];
    }
  fprintf(ox->dfp, "%*s %3d M", maxpfx, pfx, rowi);
  for (k = 0; k <= M; k++)
    {
      /* a static analyzer may complain about row[k] being uninitialized
       * if it isn't smart enough to see that M,Q are linked.
       */
      cell = row[k];
      if (logify) cell = esl_logf(row[k]);
      fprintf(ox->dfp, " %*.*f", width, precision, cell);
    }

  /* Line 1 end: Specials */
  for (z = 0; z < p7C_NXCELLS; z++)
    {
      cell = specials[z];
      if (logify) cell = esl_logf(specials[z]);
      fprintf(ox->dfp, " %*.*f", width, precision, cell);
    }
  fputc('\n', ox->dfp);

  /* Line 2: I cells: unpack, unstripe, print */
  for (q = 0; q < Q; q++)
    {
      lanes.v = P7C_IQ(dpc, q);
      for (z = 0; z < p7_VNF; z++)
        row[(Q*z + q) + 1] = lanes.x[z];
    }
  fprintf(ox->dfp, "%*s %3d I", maxpfx, pfx, rowi);
  for (k = 0; k <= M; k++)
    {
      cell = row[k];
      if (logify) cell = esl_logf(row[k]);
      fprintf(ox->dfp, " %*.*f", width, precision, cell);
    }
  fputc('\n', ox->dfp);

  /* Line 3. D cells: unpack, unstripe, print */
  for (q = 0; q < Q; q++)
    {
      lanes.v = P7C_DQ(dpc, q);
      for (z = 0; z < p7_VNF; z++)
        row[(Q*z + q) + 1] = lanes.x[z];
    }
  fprintf(ox->dfp, "%*s %3d D", maxpfx, pfx, rowi);
  for (k = 0; k <= M; k++)
    {
      cell = row[k];
      if (logify) cell = esl_logf(row[k]);
      fprintf(ox->dfp, " %*.*f", width, precision, cell);
    }
  fputc('\n', ox->dfp);
  fputc('\n', ox->dfp);

  free(row);
  return eslOK;

 ERROR:
  if (row) free(row);
  return status;
}
/* Function: p7_prior_CreateLaplace() * Synopsis: Creates Laplace plus-one prior. * Incept: SRE, Sat Jun 30 09:48:13 2007 [Janelia] * * Purpose: Create a Laplace plus-one prior for alphabet <abc>. */ P7_PRIOR * p7_prior_CreateLaplace(const ESL_ALPHABET *abc) { P7_PRIOR *pri = NULL; int status; ESL_ALLOC(pri, sizeof(P7_PRIOR)); pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL; pri->tm = esl_mixdchlet_Create(1, 3); /* single component; 3 params */ pri->ti = esl_mixdchlet_Create(1, 2); /* single component; 2 params */ pri->td = esl_mixdchlet_Create(1, 2); /* single component; 2 params */ pri->em = esl_mixdchlet_Create(1, abc->K); /* single component; K params */ pri->ei = esl_mixdchlet_Create(1, abc->K); /* single component; K params */ if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR; pri->tm->pq[0] = 1.0; esl_vec_DSet(pri->tm->alpha[0], 3, 1.0); /* match transitions */ pri->ti->pq[0] = 1.0; esl_vec_DSet(pri->ti->alpha[0], 2, 1.0); /* insert transitions */ pri->td->pq[0] = 1.0; esl_vec_DSet(pri->td->alpha[0], 2, 1.0); /* delete transitions */ pri->em->pq[0] = 1.0; esl_vec_DSet(pri->em->alpha[0], abc->K, 1.0); /* match emissions */ pri->ei->pq[0] = 1.0; esl_vec_DSet(pri->ei->alpha[0], abc->K, 1.0); /* insert emissions */ return pri; ERROR: p7_prior_Destroy(pri); return NULL; }
/* Function:  p7_hmm_ScoreDataCreate()
 * Synopsis:  Create a <P7_SCOREDATA> model object, based on MSV-filter
 *            part of profile
 *
 * Purpose:   Allocate a <P7_SCOREDATA> object, set all of its array
 *            pointers to NULL, then let scoredata_GetSSVScoreArrays()
 *            populate it from the given profiles.
 *
 *            Once a hit passes the MSV filter, and the prefix/suffix
 *            values of P7_SCOREDATA are required,
 *            p7_hmm_ScoreDataComputeRest() must be called.
 *
 * Args:      om - P7_OPROFILE containing scores used to produce
 *                 SCOREDATA contents
 *            gm - P7_PROFILE, passed through to
 *                 scoredata_GetSSVScoreArrays()
 *
 * Returns:   a pointer to the new <P7_SCOREDATA> object.
 *
 * Throws:    <NULL> if the <P7_SCOREDATA> structure itself cannot be
 *            allocated.
 */
P7_SCOREDATA *
p7_hmm_ScoreDataCreate(P7_OPROFILE *om, P7_PROFILE *gm )
{
  P7_SCOREDATA *sd = NULL;
  int           status;

  ESL_ALLOC(sd, sizeof(P7_SCOREDATA));

  /* SSV score arrays */
  sd->ssv_scores      = NULL;
  sd->ssv_scores_f    = NULL;

  /* optimal-extension tables */
  sd->opt_ext_fwd     = NULL;
  sd->opt_ext_rev     = NULL;

  /* prefix/suffix lengths */
  sd->prefix_lengths  = NULL;
  sd->suffix_lengths  = NULL;

  /* forward scores and transitions */
  sd->fwd_scores      = NULL;
  sd->fwd_transitions = NULL;

  /* NOTE(review): any status reported by this helper is not examined
   * here -- confirm whether it can fail.
   */
  scoredata_GetSSVScoreArrays(om, gm, sd);

  return sd;

 ERROR:
  p7_hmm_ScoreDataDestroy(sd);
  return NULL;
}
/* Function: p7_Handmodelmaker() * * Purpose: Manual model construction. * Construct an HMM from a digital alignment, where the * <#=RF> line of the alignment file is used to indicate the * columns assigned to matches vs. inserts. * * The <msa> must be in digital mode, and it must have * a reference annotation line. * * NOTE: <p7_Handmodelmaker()> will slightly revise the * alignment if necessary, if the assignment of columns * implies DI and ID transitions. * * Returns both the HMM in counts form (ready for applying * Dirichlet priors as the next step), and fake tracebacks * for each aligned sequence. * * Models must have at least one node, so if the <msa> defined * no consensus columns, a <eslENORESULT> error is returned. * * Args: msa - multiple sequence alignment * bld - holds information on regions requiring masking, optionally NULL -> no masking * ret_hmm - RETURN: counts-form HMM * opt_tr - optRETURN: array of tracebacks for aseq's * * Return: <eslOK> on success. <ret_hmm> and <opt_tr> are allocated * here, and must be free'd by caller. * * Returns <eslENORESULT> if no consensus columns were annotated; * in this case, <ret_hmm> and <opt_tr> are returned NULL. * * Returns <eslEFORMAT> if the <msa> doesn't have a reference * annotation line. * * Throws: <eslEMEM> on allocation failure. Throws <eslEINVAL> if the <msa> * isn't in digital mode. */ int p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr) { int status; int *matassign = NULL; /* MAT state assignments if 1; 1..alen */ int apos; /* counter for aligned columns */ if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need a digital msa"); if (msa->rf == NULL) return eslEFORMAT; ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1)); /* Watch for off-by-one. rf is [0..alen-1]; matassign is [1..alen] */ for (apos = 1; apos <= msa->alen; apos++) matassign[apos] = (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])? 
FALSE : TRUE); /* matassign2hmm leaves ret_hmm, opt_tr in their proper state: */ if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) goto ERROR; free(matassign); return eslOK; ERROR: if (matassign != NULL) free(matassign); return status; }
/* read_mask_file
 *
 * Open <filename> with the Easel file parser ('#' starts a comment),
 * read its first token, and return a copy of it as <*ret_mask>. The
 * token must contain only '0' or '1' characters.
 *
 * Args:    filename     - path of the mask file
 *          errbuf       - filled with an informative message on failure
 *          ret_mask     - RETURN: NUL-terminated copy of the token;
 *                         allocated here, caller frees
 *          ret_mask_len - RETURN: number of characters in the mask
 *
 * Returns: eslOK on success.
 *          eslFAIL, with <errbuf> filled, if the file cannot be
 *          opened, no token can be read, or the token contains a
 *          character other than '0' or '1'. Neither return argument
 *          is written on failure.
 *
 * Throws:  the status set by ESL_ALLOC() on allocation failure.
 */
int
read_mask_file(char *filename, char *errbuf, char **ret_mask, int *ret_mask_len)
{
  ESL_FILEPARSER *parser = NULL;
  char           *bits   = NULL;   /* validated copy of the token */
  char           *tok;
  int             toklen;
  int             pos;
  int             status;

  if (esl_fileparser_Open(filename, NULL, &parser) != eslOK)
    {
      ESL_XFAIL(eslFAIL, errbuf, "failed to open %s in read_mask_file\n", filename);
    }
  esl_fileparser_SetCommentChar(parser, '#');

  status = esl_fileparser_GetToken(parser, &tok, &toklen);
  if (status != eslOK)
    {
      ESL_XFAIL(eslFAIL, errbuf, "failed to read a single token from %s\n", filename);
    }

  ESL_ALLOC(bits, sizeof(char) * (toklen+1));
  for (pos = 0; pos < toklen; pos++)
    {
      if (tok[pos] != '0' && tok[pos] != '1')
        {
          ESL_XFAIL(eslFAIL, errbuf, "read a non-0 and non-1 character (%c) in the mask file %s\n", tok[pos], filename);
        }
      bits[pos] = tok[pos];
    }
  bits[pos] = '\0';

  *ret_mask     = bits;
  *ret_mask_len = pos;

  esl_fileparser_Close(parser);
  return eslOK;

 ERROR:
  if (parser) esl_fileparser_Close(parser);
  free(bits);
  return status;
}
/* sample_endpoints()
 * Incept:    SRE, Mon Jan 22 10:43:20 2007 [Janelia]
 *
 * Purpose:   Given a profile <gm> and random number source <r>, sample
 *            a begin transition from the implicit probabilistic profile
 *            model, yielding a sampled start and end node; return these
 *            via <ret_kstart> and <ret_kend>.
 *
 *            By construction, the entry at node <kstart> is into a
 *            match state, but the exit from node <kend> might turn
 *            out to be from either a match or delete state.
 *
 *            We assume that exits j are uniformly distributed for a
 *            particular entry point i: $a_{ij} =$ constant $\forall
 *            j$. Accordingly, the weight for entering at node k is
 *            exp() of the B->Mk score times the (M-k+1) exits
 *            available from k, and the exit is kstart plus a roll
 *            over (M-kstart+1) outcomes.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; <*ret_kstart> and
 *            <*ret_kend> are both set to 0 in that case.
 *
 * Xref:      STL11/138
 */
static int
sample_endpoints(ESL_RANDOMNESS *r, const P7_PROFILE *gm, int *ret_kstart, int *ret_kend)
{
  float *entry = NULL;   /* entry[0..M]: weight for starting at node k; entry[0] = 0 */
  int    node;
  int    first;
  int    last;
  int    status;

  /* We have to backcalculate a probability distribution from the
   * lod B->Mk scores in a local model; this is a little time consuming,
   * but we don't have to do it often.
   */
  ESL_ALLOC(entry, sizeof(float) * (gm->M+1));
  entry[0] = 0.0f;
  node     = 1;
  while (node <= gm->M)
    {
      /* multiply p_ij by the number of exits j */
      entry[node] = exp(p7P_TSC(gm, node-1, p7P_BM)) * (gm->M - node + 1);
      node++;
    }

  /* sample the starting position from that distribution ... */
  first = esl_rnd_FChoose(r, entry, gm->M+1);
  /* ... and the exit uniformly from possible exits for it */
  last  = first + esl_rnd_Roll(r, gm->M-first+1);

  *ret_kstart = first;
  *ret_kend   = last;
  free(entry);
  return eslOK;

 ERROR:
  free(entry);
  *ret_kstart = 0;
  *ret_kend   = 0;
  return status;
}
/* Function: cm_alndata_Create() * Synopsis: Allocate a CM_ALNDATA object. * Incept: EPN, Fri Jan 6 09:01:33 2012 * * Purpose: Allocates a new <CM_ALNDATA> and returns a pointer * to it. * * Throws: <NULL> on allocation failure. */ CM_ALNDATA * cm_alndata_Create(void) { int status; CM_ALNDATA *data = NULL; ESL_ALLOC(data, sizeof(CM_ALNDATA)); data->sq = NULL; data->idx = -1; data->tr = NULL; data->sc = 0.; data->pp = 0.; data->ppstr = NULL; data->spos = -1; data->epos = -1; data->secs_bands = 0.; data->secs_aln = 0.; data->mb_tot = 0.; data->tau = -1.; return data; ERROR: return NULL; }
/* a2m_padding_text()
 *
 * Rewrite every text-mode row of <msa> to a common length by padding
 * the insert run around each consensus column with '.' characters.
 *
 * The new alignment length is <ncons> plus the sum of nins[0..ncons].
 * <msa->rf> is allocated here (any previous pointer is overwritten, not
 * freed) and filled with '.' for insert columns and 'x' for consensus
 * columns. Each <msa->aseq[idx]> is replaced by a freshly allocated
 * NUL-terminated row; the old row is freed. <msa->alen> is updated
 * last.
 *
 * Args:    msa    - alignment to rewrite in place
 *          csflag - csflag[idx][spos] == FALSE marks 0-based position
 *                   <spos> of sequence <idx> as an insert residue; any
 *                   other value ends the current insert run.
 *                   NOTE(review): the trailing insert run only stops on
 *                   a non-FALSE entry, so each csflag[idx] presumably
 *                   carries a terminating flag -- confirm with caller.
 *          nins   - nins[cpos], cpos=0..ncons: number of insert columns
 *                   emitted before consensus column <cpos> (nins[ncons]
 *                   follows the last consensus column)
 *          ncons  - number of consensus columns
 *
 * Returns: eslOK on success.
 *
 * Throws:  the status set by ESL_ALLOC() on allocation failure; rows
 *          already swapped in stay swapped and <msa->alen> is not
 *          updated in that case.
 */
static int
a2m_padding_text(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  char *row = NULL;     /* row under construction; ownership moves to msa->aseq[] */
  int   a;              /* next column to write in the new row, 0..alen           */
  int   c;              /* consensus column index, 0..ncons                       */
  int   s;              /* next character to read from the old row                */
  int   used;           /* insert columns filled for the current <c>              */
  int   seqi;
  int   alen;
  int   status;

  /* total width = consensus columns + every insert run */
  alen = ncons;
  for (c = 0; c <= ncons; c++)
    alen += nins[c];

  /* reference line: '.' over inserts, 'x' over consensus */
  ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
  a = 0;
  for (c = 0; c <= ncons; c++)
    {
      for (used = 0; used < nins[c]; used++)
        msa->rf[a++] = '.';
      if (c < ncons)
        msa->rf[a++] = 'x';
    }
  msa->rf[a] = '\0';

  for (seqi = 0; seqi < msa->nseq; seqi++)
    {
      ESL_ALLOC(row, sizeof(char) * (alen + 1));

      a = 0;
      s = 0;
      for (c = 0; c <= ncons; c++)
        {
          /* copy this sequence's own insert residues */
          for (used = 0; csflag[seqi][s] == FALSE; used++)
            {
              row[a] = msa->aseq[seqi][s];
              a++;
              s++;
            }

          /* pad the insert run out to its full width */
          for (; used < nins[c]; used++)
            {
              row[a] = '.';
              a++;
            }

          /* then the consensus residue, except after the last run */
          if (c < ncons)
            {
              row[a] = msa->aseq[seqi][s];
              a++;
              s++;
            }
        }
      ESL_DASSERT1( (msa->aseq[seqi][s] == '\0') );
      ESL_DASSERT1( (a == alen) );
      row[alen] = '\0';

      free(msa->aseq[seqi]);
      msa->aseq[seqi] = row;
      row = NULL;               /* now owned by msa */
    }

  msa->alen = alen;
  return eslOK;

 ERROR:
  if (row) free(row);
  return status;
}