/* Function: p7_prior_CreateLaplace() * Synopsis: Creates Laplace plus-one prior. * Incept: SRE, Sat Jun 30 09:48:13 2007 [Janelia] * * Purpose: Create a Laplace plus-one prior for alphabet <abc>. */ P7_PRIOR * p7_prior_CreateLaplace(const ESL_ALPHABET *abc) { P7_PRIOR *pri = NULL; int status; ESL_ALLOC(pri, sizeof(P7_PRIOR)); pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL; pri->tm = esl_mixdchlet_Create(1, 3); /* single component; 3 params */ pri->ti = esl_mixdchlet_Create(1, 2); /* single component; 2 params */ pri->td = esl_mixdchlet_Create(1, 2); /* single component; 2 params */ pri->em = esl_mixdchlet_Create(1, abc->K); /* single component; K params */ pri->ei = esl_mixdchlet_Create(1, abc->K); /* single component; K params */ if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR; pri->tm->pq[0] = 1.0; esl_vec_DSet(pri->tm->alpha[0], 3, 1.0); /* match transitions */ pri->ti->pq[0] = 1.0; esl_vec_DSet(pri->ti->alpha[0], 2, 1.0); /* insert transitions */ pri->td->pq[0] = 1.0; esl_vec_DSet(pri->td->alpha[0], 2, 1.0); /* delete transitions */ pri->em->pq[0] = 1.0; esl_vec_DSet(pri->em->alpha[0], abc->K, 1.0); /* match emissions */ pri->ei->pq[0] = 1.0; esl_vec_DSet(pri->ei->alpha[0], abc->K, 1.0); /* insert emissions */ return pri; ERROR: p7_prior_Destroy(pri); return NULL; }
/* seq_generation()
 *
 * Generating sequences.
 *
 * Writes <-N> random sequences of length <-L> to <ofp> in format <outfmt>.
 * The alphabet is chosen by whichever of --rna, --dna, --amino is set.
 * Residues are drawn i.i.d. from a frequency vector: 0.25 each for
 * DNA/RNA, esl_composition_SW34() for amino, uniform otherwise.
 * A single sequence is named "random"; with N > 1 they are named
 * "random0", "random1", ...
 *
 * Args:     go     - processed command line options
 *           r      - random number generator
 *           ofp    - open output stream
 *           outfmt - output sequence file format code
 *
 * Returns:  <eslOK> on success. On failure, returns the status from the
 *           frequency-vector allocation, esl_sq_GrowTo() or esl_sqio_Write().
 *           Exits via esl_fatal() if -L <= 0 or if the alphabet or sequence
 *           object cannot be created.
 */
static int
seq_generation(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  ESL_ALPHABET *abc       = NULL;
  ESL_SQ       *sq        = NULL;
  double       *fq        = NULL;
  int           alphatype = eslUNKNOWN; /* static checkers can't see that 1 of --rna, --dna, --amino must be true */
  int           N         = esl_opt_GetInteger(go, "-N");
  int           L         = esl_opt_GetInteger(go, "-L");
  int           i;
  int           status;

  if (L <= 0) esl_fatal("To generate sequences, set -L option (length of generated seqs) > 0 ");
  if (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;

  /* Both objects are dereferenced below (abc->K, sq->dsq), so a failed
   * creation must be caught here rather than crash on a NULL pointer.
   */
  if ((abc = esl_alphabet_Create(alphatype)) == NULL) esl_fatal("Failed to create alphabet");
  if ((sq  = esl_sq_CreateDigital(abc))      == NULL) esl_fatal("Failed to create digital sequence");
  if ((status = esl_sq_GrowTo(sq, L))        != eslOK) goto ERROR;

  /* Pick the iid frequency distribution to use */
  ESL_ALLOC(fq, sizeof(double) * abc->K);
  switch (alphatype) {
  case eslRNA:
  case eslDNA:    esl_vec_DSet(fq, 4, 0.25); break;
  case eslAMINO:  esl_composition_SW34(fq);  break;
  default:        esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K); break;
  }

  /* generate */
  for (i = 0; i < N; i++)
    {
      esl_rsq_xIID(r, fq, abc->K, L, sq->dsq);
      if (N > 1) esl_sq_FormatName(sq, "random%d", i);
      else       esl_sq_SetName(sq, "random");
      sq->n = L;
      /* Stop at the first write failure instead of silently dropping output. */
      if ((status = esl_sqio_Write(ofp, sq, outfmt, FALSE)) != eslOK) goto ERROR;
    }

  free(fq);
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return eslOK;

 ERROR:
  free(fq);			/* free(NULL) is a no-op */
  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  return status;
}
/* The gradient of the NLL w.r.t. each free parameter in p. */ static void hyperexp_complete_gradient(double *p, int np, void *dptr, double *dp) { struct hyperexp_data *data = (struct hyperexp_data *) dptr; ESL_HYPEREXP *h = data->h; double pdf; int i,k; int pidx; hyperexp_unpack_paramvector(p, np, h); esl_vec_DSet(dp, np, 0.); for (i = 0; i < data->n; i++) { /* FIXME: I think the calculation below may need to be done * in log space, to avoid underflow errors; see complete_binned_gradient() */ /* Precalculate q_k PDF_k(x) terms, and their sum */ for (k = 0; k < h->K; k++) h->wrk[k] = h->q[k] * esl_exp_pdf(data->x[i], h->mu, h->lambda[k]); pdf = esl_vec_DSum(h->wrk, h->K); pidx = 0; if (! h->fixmix) { for (k = 1; k < h->K; k++) /* generic d/dQ solution for mixture models */ dp[pidx++] -= h->wrk[k]/pdf - h->q[k]; } for (k = 0; k < h->K; k++) if (! h->fixlambda[k]) dp[pidx++] -= (1.-h->lambda[k]*(data->x[i]-h->mu))*h->wrk[k]/pdf; /* d/dw */ } }
static void hyperexp_complete_binned_gradient(double *p, int np, void *dptr, double *dp) { struct hyperexp_binned_data *data = (struct hyperexp_binned_data *) dptr; ESL_HISTOGRAM *g = data->g; ESL_HYPEREXP *h = data->h; int i,k; int pidx; double z; double tmp; double ai, delta; hyperexp_unpack_paramvector(p, np, h); esl_vec_DSet(dp, np, 0.); delta = g->w; /* counting over occupied, uncensored histogram bins */ for (i = g->cmin; i <= g->imax; i++) { if (g->obs[i] == 0) continue; ai = esl_histogram_Bin2LBound(g, i); if (ai < h->mu) ai = h->mu; /* careful about the left boundary: no x < h->mu */ /* Calculate log (q_m alpha_m(a_i) terms */ for (k = 0; k < h->K; k++) { h->wrk[k] = log(h->q[k]) - h->lambda[k]*(ai-h->mu); if (delta * h->lambda[k] < eslSMALLX1) h->wrk[k] += log(delta * h->lambda[k]); else h->wrk[k] += log(1 - exp(-delta * h->lambda[k])); } z = esl_vec_DLogSum(h->wrk, h->K); /* z= log \sum_k q_k alpha_k(a_i) */ /* Bump the gradients for Q_1..Q_{K-1} */ pidx = 0; if (! h->fixmix) { for (k = 1; k < h->K; k++) dp[pidx++] -= g->obs[i] * (exp(h->wrk[k] - z) - h->q[k]); } /* Bump the gradients for w_0..w_{K-1} */ for (k = 0; k < h->K; k++) if (! h->fixlambda[k]) { tmp = log(h->q[k]) + log(h->lambda[k])- h->lambda[k]*(ai-h->mu); tmp = exp(tmp - z); tmp *= (ai + delta - h->mu) * exp(-delta * h->lambda[k]) - (ai - h->mu); dp[pidx++] -= g->obs[i] * tmp; } } }
/* Example driver: allocate a 4-element vector, set every element to 1.0,
 * pass it through esl_vec_DNorm(), and dump it to stdout labeled "ACGT".
 *
 * Returns 0 on success, 1 if the vector could not be allocated.
 */
int
main(void)
{
  double *p;
  char    labels[] = "ACGT";
  int     n        = 4;

  p = malloc(sizeof *p * n);
  if (p == NULL)		/* don't hand a NULL vector to the esl_vec_* calls */
    {
      fprintf(stderr, "malloc failed\n");
      return 1;
    }

  esl_vec_DSet(p, n, 1.0);
  esl_vec_DNorm(p, n);
  esl_vec_DDump(stdout, p, n, labels);

  free(p);
  return 0;
}
/* relative_weights():
 * Set msa->wgt vector, using user's choice of relative weighting algorithm.
 *
 * The choice is <bld->wgt_strategy>:
 *   p7_WGT_NONE   - every weight is set to 1.0
 *   p7_WGT_GIVEN  - <msa->wgt> is left untouched
 *   p7_WGT_PB     - esl_msaweight_PB()
 *   p7_WGT_GSC    - esl_msaweight_GSC()
 *   p7_WGT_BLOSUM - esl_msaweight_BLOSUM(), using <bld->wid>
 *
 * Returns <eslOK> on success. If the weighting routine fails, its status
 * is reported through ESL_FAIL() with a message in <bld->errbuf>. An
 * unrecognized strategy raises an <eslEINCONCEIVABLE> exception.
 */
static int
relative_weights(P7_BUILDER *bld, ESL_MSA *msa)
{
  int status = eslOK;

  /* Two strategies run no weighting algorithm at all. */
  if (bld->wgt_strategy == p7_WGT_GIVEN) return eslOK;
  if (bld->wgt_strategy == p7_WGT_NONE)
    {
      esl_vec_DSet(msa->wgt, msa->nseq, 1.);
      return eslOK;
    }

  /* The rest delegate to a weighting routine that can fail. */
  if      (bld->wgt_strategy == p7_WGT_PB)     status = esl_msaweight_PB(msa);
  else if (bld->wgt_strategy == p7_WGT_GSC)    status = esl_msaweight_GSC(msa);
  else if (bld->wgt_strategy == p7_WGT_BLOSUM) status = esl_msaweight_BLOSUM(msa, bld->wid);
  else ESL_EXCEPTION(eslEINCONCEIVABLE, "no such weighting strategy");

  if (status != eslOK) ESL_FAIL(status, bld->errbuf, "failed to set relative weights in alignment");
  return eslOK;
}
/* Function:  esl_paml_ReadE()
 * Incept:    SRE, Fri Jul  9 09:27:24 2004 [St. Louis]
 *
 * Purpose:   Read an amino acid rate matrix in PAML format from stream
 *            <fp>. Return it in two pieces: the symmetric E
 *            exchangeability matrix in <E>, and the stationary
 *            probability vector $\pi$ in <pi>.
 *            Caller provides the memory for both <E> and <pi>.  <E>
 *            is a $20 \times 20$ matrix allocated as
 *            <esl_dmatrix_Create(20, 20)>. <pi> is an array with
 *            space for at least 20 doubles.
 *
 *            The <E> matrix is symmetric for off-diagonal elements:
 *            $E_{ij} = E_{ji}$ for $i \neq j$.  The on-diagonal
 *            elements $E_{ii}$ are not valid and should not be
 *            accessed.  (They are set to zero.)
 *            The rate matrix will later be obtained from <E>
 *            and <pi> as
 *               $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$
 *            and
 *               $Q_{ii} = -\sum_{j \neq i} Q_{ij}$
 *            then scaled to units of one
 *            substitution/site; see <esl_ratemx_E2Q()> and
 *            <esl_ratemx_ScaleTo()>.
 *
 *            Data file format: First 190 numbers are a
 *            lower-triangular matrix E of amino acid
 *            exchangeabilities $E_{ij}$. Next 20 numbers are the
 *            amino acid frequencies $\pi_i$. Remainder of the
 *            datafile is ignored.
 *
 *            The alphabet order in the matrix and the frequency
 *            vector is assumed to be "ARNDCQEGHILKMFPSTWYV"
 *            (alphabetical by three-letter code), which appears to be
 *            PAML's default order. This is transformed to Easel's
 *            "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code)
 *            in the $E_{ij}$ and $\pi_i$ that are returned.
 *
 *            Tokens are converted with atof(); a token that is not a
 *            number is not detected as an error.
 *
 * Args:      fp   - open datafile for reading.
 *            E    - RETURN: E matrix of amino acid exchangeabilities e_ij,
 *                     symmetric (E_ij = E_ji),
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *            pi   - RETURN: \pi_i vector of amino acid frequencies,
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *
 * Returns:   <eslOK> on success.
 *            Returns <eslEOF> on premature end of file (parse failed), in which
 *            case the contents of <E> and <pi> are undefined.
 *
 * Throws:    <eslEMEM> on internal allocation failure,
 *            and the contents of <E> and <pi> are undefined.
 *
 * Xref:      STL8/p.56.
 */
int
esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi)
{
  int             status;
  ESL_FILEPARSER *efp = NULL;
  char           *tok;
  int             i,j;
  char           *pamlorder = "ARNDCQEGHILKMFPSTWYV";
  char           *eslorder  = "ACDEFGHIKLMNPQRSTVWY";
  int             perm[20];

  if ((status = esl_dmatrix_SetZero(E)) != eslOK) goto ERROR;
  esl_vec_DSet(pi, 20, 0.);

  /* <status> is eslOK at this point, so a failed Create must set an error
   * code itself; otherwise the ERROR path would report success with
   * nothing read.
   */
  if ((efp = esl_fileparser_Create(fp)) == NULL) { status = eslEMEM; goto ERROR; }
  if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR;

  /* Construct the alphabet permutation we need.
   * perm[i] -> original row/column i goes to row/column perm[i]
   */
  for (i = 0; i < 20; i++)
    perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder);

  /* Read the s_ij matrix data in, permuting as we go. */
  for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
	E->mx[perm[i]][perm[j]] = atof(tok);
	E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]];
      }

  /* Read the pi_i vector in, permuting as we read. */
  for (i = 0; i < 20; i++)
    {
      if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
      pi[perm[i]] = atof(tok);
    }

  esl_fileparser_Destroy(efp);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Destroy(efp);
  return status;
}
static void utest_SetWAG(void) { char errbuf[eslERRBUFSIZE]; ESL_DMATRIX *Q = NULL; ESL_DMATRIX *P = NULL; double t = 50.0; /* sufficiently large to drive e^tQ to stationarity */ double pi[20]; int i; if ((Q = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); if ((P = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); /* This tests that exponentiating WAG gives a stable conditional * probability matrix solution. (It doesn't particularly test that * WAG was set correctly, but how could we have screwed that up?) */ if (esl_rmx_SetWAG(Q, NULL) != eslOK) esl_fatal("_SetWAG() failed"); if (esl_dmx_Exp(Q, t, P) != eslOK) esl_fatal("matrix exponentiation failed"); if (esl_rmx_ValidateP(P, 1e-7, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf); if (esl_rmx_ValidateQ(Q, 1e-7, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf); /* This tests setting WAG to different stationary pi's than default, * then tests that exponentiating to large t reaches those stationaries. */ esl_vec_DSet(pi, 20, 0.05); if (esl_rmx_SetWAG(Q, pi) != eslOK) esl_fatal("_SetWAG() failed"); if (esl_dmx_Exp(Q, t, P) != eslOK) esl_fatal("matrix exponentiation failed"); if (esl_rmx_ValidateP(P, 1e-7, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf); if (esl_rmx_ValidateQ(Q, 1e-7, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf); for (i = 0; i < 20; i++) if (esl_vec_DCompare(P->mx[i], pi, 20, 1e-7) != eslOK) esl_fatal("P didn't converge to right pi's"); esl_dmatrix_Destroy(Q); esl_dmatrix_Destroy(P); return; }
/* main()
 *
 * Program entry point. Expects exactly three command line arguments:
 * <basename>, <alifile>, <dbfile>.
 *
 * Steps, in order:
 *   1. Parse and verify the command line.
 *   2. Fill in the shared <cfg> structure (RNG, thresholds from -F, -1, -2).
 *   3. Open five output files: <basename>.msa, .fa, .pos, .neg, .tbl.
 *      Each name is built in the 256-byte <outfile> buffer; truncation
 *      or a failed fopen() is fatal.
 *   4. Open <alifile> (Stockholm) and settle the alphabet: from --amino,
 *      --dna, --rna if given, otherwise by guessing from the file.
 *   5. Set background frequencies <cfg.fq>, then process <dbfile> (FASTA).
 *   6. For each alignment read: call remove_fragments() and
 *      separate_sets(); if the test stack holds at least two objects,
 *      shuffle it, call synthesize_positives(), write the training
 *      alignment to the .msa file, and add a line to the .tbl file.
 *   7. Call synthesize_negatives() with the -N option value, close
 *      everything, and return 0.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration */
  struct cfg_s  cfg;		/* application configuration */
  char         *basename= NULL;	/* base of the output file names */
  char         *alifile = NULL;	/* alignment file name */
  char         *dbfile  = NULL;	/* name of seq db file */
  char          outfile[256];	/* name of an output file */
  int           alifmt;		/* format code for alifile */
  int           dbfmt;		/* format code for dbfile */
  ESL_MSAFILE  *afp     = NULL;	/* open alignment file */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned */
  ESL_STACK    *teststack=NULL;	/* test set: stack of ESL_SQ ptrs */
  int           status;		/* easel return code */
  int           nfrags;		/* # of fragments removed */
  int           ntestdom;	/* # of test domains */
  int           ntest;		/* # of test sequences created */
  int           nali;		/* number of alignments read */
  double        avgid;		/* average identity of the training set, from esl_dst_XAverageId() */

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1);
  alifile = esl_opt_GetArg(go, 2);
  dbfile = esl_opt_GetArg(go, 3);
  alifmt = eslMSAFILE_STOCKHOLM;
  dbfmt = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded();
  else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc = NULL; /* until we open the MSA file, below */
  cfg.fragfrac = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest = 0;

  /* Open the output files */
  /* snprintf() returning >= 256 means the name did not fit in <outfile>. */
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);

  /* Open the MSA file; determine alphabet */
  status = esl_msafile_Open(alifile, alifmt, NULL, &afp);
  if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
  else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile);
  else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status);

  /* An explicit alphabet option wins; only guess when none was given. */
  if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA);
  else {
    int type;
    status = esl_msafile_GuessAlphabet(afp, &type);
    if (status == eslEAMBIGUOUS) esl_fatal("Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", alifile);
    else if (status == eslEFORMAT) esl_fatal("Alignment file parse failed: %s\n", afp->errbuf);
    else if (status == eslENODATA) esl_fatal("Alignment file %s is empty\n", alifile);
    else if (status != eslOK) esl_fatal("Failed to read alignment file %s\n", alifile);
    cfg.abc = esl_alphabet_Create(type);
  }
  esl_msafile_SetDigital(afp, cfg.abc);

  /* Background frequencies: SW34 composition for amino, uniform otherwise. */
  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time */
  nali = 0;
  while ((status = esl_msa_Read(afp, &origmsa)) == eslOK)
    {
      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets (&cfg, msa, &trainmsa, &teststack);
      ntestdom = esl_stack_ObjectCount(teststack);

      /* Alignments with fewer than two test domains produce no output and are not counted in <nali>. */
      if (ntestdom >= 2)
	{
	  esl_stack_Shuffle(cfg.r, teststack);
	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  esl_msa_MinimGaps(trainmsa, NULL, NULL);
	  esl_msa_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  /* .tbl columns: name, %id, training alen, nseq after fragment removal, nfrags, training nseq, ntestdom, ntest */
	  fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  /* The loop ends on any non-OK status; only eslEOF is a normal end of input. */
  if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", afp->linenumber, afp->fname, afp->errbuf, afp->buf);
  else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status);
  else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);

  if (nali > 0)
    synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  esl_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
/* Function: p7_prior_CreateNucleic() * * Purpose: Creates the default mixture Dirichlet prior for nucleotide * sequences. * * The transition priors (match, insert, delete) are all * single Dirichlets, trained on a portion of the rmark dataset * * The match emission prior is an eight-component mixture * trained against a portion of the rmark dataset * * The insert emission prior is a single Dirichlet with * high $|\alpha|$, such that insert emission probabilities * are essentially fixed by the prior, regardless of * observed count data. * * Returns: a pointer to the new <P7_PRIOR> structure. */ P7_PRIOR * p7_prior_CreateNucleic(void) { int status; P7_PRIOR *pri = NULL; int q; /* Plus-1 Laplace prior int num_comp = 1; static double defmq[2] = { 1.0 }; static double defm[1][4] = { { 1.0, 1.0, 1.0, 1.0} // }; */ /* Match emission priors are trained on Rmark3 database * Xref: ~wheelert/notebook/2011/0325_nhmmer_new_parameters */ int num_comp = 4; static double defmq[4] = { 0.24, 0.26, 0.08, 0.42 }; static double defm[4][4] = { { 0.16, 0.45, 0.12, 0.39}, { 0.09, 0.03, 0.09, 0.04}, { 1.29, 0.40, 6.58, 0.51}, { 1.74, 1.49, 1.57, 1.95} }; ESL_ALLOC(pri, sizeof(P7_PRIOR)); pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL; pri->tm = esl_mixdchlet_Create(1, 3); // match transitions; single component; 3 params pri->ti = esl_mixdchlet_Create(1, 2); // insert transitions; single component; 2 params pri->td = esl_mixdchlet_Create(1, 2); // delete transitions; single component; 2 params pri->em = esl_mixdchlet_Create(num_comp, 4); // match emissions; X component; 4 params pri->ei = esl_mixdchlet_Create(1, 4); // insert emissions; single component; 4 params if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR; /* Transition priors: roughly, learned from rmark benchmark - hand-beautified (trimming overspecified significant digits) */ pri->tm->pq[0] = 1.0; pri->tm->alpha[0][0] = 2.0; // TMM pri->tm->alpha[0][1] = 0.1; // 
TMI pri->tm->alpha[0][2] = 0.1; // TMD pri->ti->pq[0] = 1.0; pri->ti->alpha[0][0] = 0.06; // TIM pri->ti->alpha[0][1] = 0.2; // TII pri->td->pq[0] = 1.0; pri->td->alpha[0][0] = 0.1; // TDM pri->td->alpha[0][1] = 0.2; // TDD /* Match emission priors */ for (q = 0; q < num_comp; q++) { pri->em->pq[q] = defmq[q]; esl_vec_DCopy(defm[q], 4, pri->em->alpha[q]); } /* Insert emission priors. Should that alphas be lower? higher? */ pri->ei->pq[0] = 1.0; esl_vec_DSet(pri->ei->alpha[0], 4, 1.0); return pri; ERROR: if (pri != NULL) p7_prior_Destroy(pri); return NULL; }
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in canonical residues (to compensate for
 *            that effect), then normalized to sum to nseq.
 *
 *            An advantage of the PB method is efficiency.
 *            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
 *            N sequences and L columns. This makes it a good method
 *            for ad hoc weighting of very deep alignments.
 *
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other residues as
 *            gaps.
 *
 *            A single-sequence alignment is a special case: its one
 *            weight is set to 1.0 and the function returns at once.
 *
 * Args:      msa - alignment to weight; <msa->wgt> is overwritten.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified.
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;   	/* counts of each residue observed in a column */
  int     ntotal;		/* number of different symbols observed in a column */
  int     rlen;			/* number of residues in a sequence */
  int     idx, pos, i;
  int     K;			/* alphabet size */
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize */
  if (! (msa->flags & eslMSA_DIGITAL))
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      for (pos = 0; pos < msa->alen; pos++)
	{
	  /* Collect # of letters A..Z in this column, and total */
	  esl_vec_ISet(nres, K, 0);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (isalpha((int) msa->aseq[idx][pos]))
	      nres[toupper((int) msa->aseq[idx][pos]) - 'A'] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each seq by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (isalpha((int) msa->aseq[idx][pos]))
		msa->wgt[idx] += 1. /
		  (double) (ntotal * nres[toupper((int) msa->aseq[idx][pos]) - 'A'] );
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
	{
	  for (rlen = 0, pos = 0; pos < msa->alen; pos++)
	    if (isalpha((int) msa->aseq[idx][pos])) rlen++;
	  /* Guard on this sequence's own residue count, as the digital
	   * branch does. <ntotal> only describes the last column, so
	   * testing it here could divide by zero, or skip the
	   * normalization altogether.
	   */
	  if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	  /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
	}
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
	{
	  /* Collect # of residues 0..K-1 in this column, and total # */
	  esl_vec_ISet(nres, K, 0);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
	      nres[(int) msa->ax[idx][pos]] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each sequence by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
		msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
	{
	  for (rlen = 0, pos = 1; pos <= msa->alen; pos++)
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
	  if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	  /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
	}
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  if (nres != NULL) free(nres);
  return status;
}
/* main()
 *
 * Program entry point. Expects exactly three command line arguments:
 * <basename>, <alifile>, <dbfile>.
 *
 * Steps, in order:
 *   1. Parse and verify the command line.
 *   2. Fill in the shared <cfg> structure (RNG, thresholds from -F, -1, -2,
 *      optional size limits from --maxtest and --maxtrain; 0 = no limit).
 *   3. Open the output files: <basename>.msa, .fa, .pos, .neg, .tbl, and
 *      <basename>.pid when --pid is set. Each name is built in the 256-byte
 *      <outfile> buffer; truncation or a failed fopen() is fatal.
 *   4. Create the alphabet if --amino, --dna or --rna was given, then open
 *      <alifile> (Stockholm), passing the address of <cfg.abc>.
 *   5. Set background frequencies <cfg.fq>, then process <dbfile> (FASTA).
 *   6. For each alignment read: call remove_fragments() and
 *      separate_sets(); if the test stack holds at least two objects,
 *      shuffle and optionally truncate both sets, call
 *      synthesize_positives(), write the training alignment to the .msa
 *      file, and add a line to the .tbl file.
 *   7. Call synthesize_negatives() with the -N option value, close
 *      everything, and return 0.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* command line configuration */
  struct cfg_s  cfg;		/* application configuration */
  char         *basename= NULL;	/* base of the output file names */
  char         *alifile = NULL;	/* alignment file name */
  char         *dbfile  = NULL;	/* name of seq db file */
  char          outfile[256];	/* name of an output file */
  int           alifmt;		/* format code for alifile */
  int           dbfmt;		/* format code for dbfile */
  ESLX_MSAFILE *afp     = NULL;	/* open alignment file */
  ESL_MSA      *origmsa = NULL;	/* one multiple sequence alignment */
  ESL_MSA      *msa     = NULL;	/* MSA after frags are removed */
  ESL_MSA      *trainmsa= NULL;	/* training set, aligned */
  ESL_STACK    *teststack=NULL;	/* test set: stack of ESL_SQ ptrs */
  int           status;		/* easel return code */
  int           nfrags;		/* # of fragments removed */
  int           ntestdom;	/* # of test domains */
  int           ntest;		/* # of test sequences created */
  int           nali;		/* number of alignments read */
  double        avgid;		/* average identity of the training set, from esl_dst_XAverageId() */

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  basename = esl_opt_GetArg(go, 1);
  alifile = esl_opt_GetArg(go, 2);
  dbfile = esl_opt_GetArg(go, 3);
  alifmt = eslMSAFILE_STOCKHOLM;
  dbfmt = eslSQFILE_FASTA;

  /* Set up the configuration structure shared amongst functions here */
  if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded();
  else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));
  cfg.abc = NULL; /* until we open the MSA file, below */
  cfg.fragfrac = esl_opt_GetReal(go, "-F");
  cfg.idthresh1 = esl_opt_GetReal(go, "-1");
  cfg.idthresh2 = esl_opt_GetReal(go, "-2");
  cfg.test_lens = NULL;
  cfg.ntest = 0;
  /* A limit of 0 means "no limit": the loop below only applies a limit when it is nonzero. */
  cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0);
  cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0);

  /* Open the output files */
  /* snprintf() returning >= 256 means the name did not fit in <outfile>. */
  if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name");
  if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name");
  if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name");
  if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name");
  if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile);
  if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name");
  if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile);
  /* The %id table is optional; cfg.pidfp stays NULL when --pid is off. */
  if (esl_opt_GetBoolean(go, "--pid")) {
    if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name");
    if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile);
  } else cfg.pidfp = NULL;

  /* Open the MSA file, digital mode; determine alphabet */
  /* If no alphabet option is set, cfg.abc is still NULL when its address is passed to the open call. */
  if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA);

  status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  /* Background frequencies: SW34 composition for amino, uniform otherwise. */
  if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq);
  else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K);

  /* Open and process the dbfile; make sure it's in the same alphabet */
  process_dbfile(&cfg, dbfile, dbfmt);

  /* Read and process MSAs one at a time */
  nali = 0;
  while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF)
    {
      /* Any status other than eslOK or eslEOF is handed to the read-failure handler. */
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
      esl_msa_ConvertDegen2X(origmsa);
      esl_msa_Hash(origmsa);

      remove_fragments(&cfg, origmsa, &msa, &nfrags);
      separate_sets (&cfg, msa, &trainmsa, &teststack);

      /* Alignments with fewer than two test domains produce no output and are not counted in <nali>. */
      if ( esl_stack_ObjectCount(teststack) >= 2)
	{
	  /* randomize test domain order, and apply size limit if any */
	  esl_stack_Shuffle(cfg.r, teststack);
	  if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest);
	  ntestdom = esl_stack_ObjectCount(teststack);

	  /* randomize training set alignment order, and apply size limit if any */
	  esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa);
	  if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain);
	  esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE);

	  if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack);

	  synthesize_positives(go, &cfg, msa->name, teststack, &ntest);

	  eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM);

	  esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */
	  /* .tbl columns: name, %id, training alen, nseq after fragment removal, nfrags, training nseq, ntestdom, ntest */
	  fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest);
	  nali++;
	}

      esl_msa_Destroy(trainmsa);
      esl_msa_Destroy(origmsa);
      esl_msa_Destroy(msa);
    }
  if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile);

  synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N"));

  fclose(cfg.out_msafp);
  fclose(cfg.out_seqfp);
  fclose(cfg.possummfp);
  fclose(cfg.negsummfp);
  fclose(cfg.tblfp);
  if (cfg.pidfp) fclose(cfg.pidfp);
  esl_randomness_Destroy(cfg.r);
  esl_alphabet_Destroy(cfg.abc);
  eslx_msafile_Close(afp);
  esl_getopts_Destroy(go);
  return 0;
}
/* dump_insert_info
 *
 * Given an MSA with RF annotation, print out information about how many 'insertions' come
 * after each non-gap RF column (consensus column).
 *
 * For each rfpos in 0..rflen that has at least one sequence with an
 * insertion, one line is printed: rfpos, the (optionally weighted) number
 * of sequences with an insert there, that number divided by <msa->nseq>,
 * and the average insert length. rfpos 0 holds inserts before the first
 * nongap RF position. The report ends with a "//" line.
 *
 * Args:     fp          - open output stream
 *           msa         - digitized alignment; must have <msa->rf>
 *           use_weights - TRUE to weight counts by <msa->wgt>
 *           nali        - index of this alignment, printed in the header
 *           i_am_rf     - [0..alen-1]; nonzero for nongap RF columns
 *           alifile     - alignment file name, printed in the header
 *           errbuf      - receives an error message on failure
 *
 * Returns:  <eslOK> on success.
 *           <eslEINVAL> if <msa> is not digital, has no RF markup, or
 *           <i_am_rf> is NULL; <eslEINCOMPAT> if <use_weights> is set but
 *           <msa->wgt> is NULL. On any failure, including allocation
 *           failure, every buffer allocated here is freed before returning.
 */
static int
dump_insert_info(FILE *fp, ESL_MSA *msa, int use_weights, int nali, int *i_am_rf, char *alifile, char *errbuf)
{
  int      status;
  int      apos, rfpos;
  double **ict       = NULL;	/* [0..alen][0..nseq-1] insert residues per rfpos, per seq */
  double  *total_ict = NULL;	/* [0..alen] weighted insert residues per rfpos */
  int      i;
  int      rflen;
  double   seqwt;		/* weight of current sequence */
  double   nseq;

  /* contract check */
  if(! (msa->flags & eslMSA_DIGITAL)) ESL_XFAIL(eslEINVAL, errbuf, "in dump_insert_info(), msa must be digitized.");
  if(msa->rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "No #=GC RF markup in alignment, it is needed for --iinfo.");
  if(i_am_rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "internal error, dump_insert_info() i_am_rf is NULL.");
  if(use_weights && msa->wgt == NULL) ESL_XFAIL(eslEINCOMPAT, errbuf, "dump_insert_info(): use_weights==TRUE but msa->wgt == NULL");

  ESL_ALLOC(total_ict, sizeof(double) * (msa->alen+2));
  esl_vec_DSet(total_ict, (msa->alen+2), 0.);

  ESL_ALLOC(ict, sizeof(double *) * (msa->alen+2));
  /* NULL every row first, so the ERROR path can free a partly built table. */
  for(i = 0; i < msa->alen+2; i++) ict[i] = NULL;
  for(i = 0; i <= msa->alen; i++)
    {
      ESL_ALLOC(ict[i], sizeof(double) * (msa->nseq));
      esl_vec_DSet(ict[i], (msa->nseq), 0.);
    }

  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# rfpos is the nongap RF position after which insertions occur\n");
  fprintf(fp, "# An rfpos of '0' indicates insertions before the first nongap RF position\n");
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");
  fprintf(fp, "# %8s %10s %8s %8s\n", "rfpos", "nseq w/ins", "freq ins", "avg len");
  fprintf(fp, "# %8s %10s %8s %8s\n", "--------", "----------", "--------", "--------");

  /* Count insert residues: every residue in a non-RF column is charged to
   * the most recent RF position seen so far.
   */
  rfpos = 0;
  for(apos = 1; apos <= msa->alen; apos++)
    {
      if(i_am_rf[apos-1]) rfpos++;
      else
	{
	  for(i = 0; i < msa->nseq; i++)
	    {
	      seqwt = use_weights ? msa->wgt[i] : 1.0;
	      if(esl_abc_XIsResidue(msa->abc, msa->ax[i][apos]))
		{
		  ict[rfpos][i]++;
		  total_ict[rfpos] += seqwt;
		}
	    }
	}
    }
  rflen = rfpos;		/* number of nongap RF columns */

  for(rfpos = 0; rfpos <= rflen; rfpos++)
    {
      nseq = 0.;
      for(i = 0; i < msa->nseq; i++)
	{
	  if(ict[rfpos][i] >= 1)
	    {
	      seqwt = use_weights ? msa->wgt[i] : 1.0;
	      nseq += seqwt;
	    }
	}
      if(nseq > 0.)
	fprintf(fp, " %8d %10.1f %8.6f %8.3f\n", rfpos, nseq, nseq / (float) msa->nseq, ((float) total_ict[rfpos] / (float) nseq));
    }
  fprintf(fp, "//\n");

  for(i = 0; i <= msa->alen; i++) free(ict[i]);
  free(ict);
  free(total_ict);
  return eslOK;

 ERROR:
  if(ict != NULL)
    {
      for(i = 0; i <= msa->alen; i++) free(ict[i]);	/* rows never allocated are NULL */
      free(ict);
    }
  free(total_ict);
  return status;
}
/* dump_infocontent_info
 *
 * Dump per-column information content (relative to a uniform background
 * over the abc->K canonical symbols) to an open output file.
 *
 * Args:    fp          - open output stream the report is written to
 *          abc         - alphabet; abc->K is the number of symbols counted
 *          abc_ct      - [0..alen-1][0..abc->K] per-column counts; index
 *                        abc->K is read here as the gap count
 *          use_weights - only selects which header line is printed
 *          nali        - index of this alignment, echoed in the header
 *          alen        - number of alignment columns
 *          nseq        - number of sequences, echoed in the header
 *          i_am_rf     - [0..alen-1] nonzero for RF columns, or NULL; when
 *                        non-NULL an extra 'rfpos' column is printed
 *          msa_name    - alignment name for the header, or NULL
 *          alifile     - alignment file name, echoed in the header
 *          errbuf      - buffer that receives an error message on failure
 *
 * Returns: eslOK on success; on allocation failure, the status set by
 *          ESL_ALLOC, with <errbuf> set to "out of memory".
 */
static int
dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int     status;
  int     apos, rfpos;
  double  bg_ent;            /* entropy of the uniform background distribution */
  double *bg       = NULL;
  double *abc_freq = NULL;   /* normalized residue frequencies of the current column */
  double  nnongap;           /* total nongap count in the current column */
  double  ntotal;            /* nongap + gap count in the current column */

  /* The background is uniform; we only need its entropy. */
  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);
  bg = NULL;

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);

  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if (use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else             { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if (i_am_rf != NULL)
    {
      fprintf(fp, "# %7s %7s %10s %10s\n", "rfpos", "alnpos", "freqnongap", "info(bits)");
      fprintf(fp, "# %7s %7s %10s %10s\n", "-------", "-------", "----------", "----------");
    }
  else
    {
      fprintf(fp, "# %7s %10s %10s\n", "alnpos", "freqnongap", "info(bits)");
      fprintf(fp, "# %7s %10s %10s\n", "-------", "----------", "----------");
    }

  rfpos = 0;
  for (apos = 0; apos < alen; apos++)
    {
      if (i_am_rf != NULL)
        {
          if (i_am_rf[apos]) { fprintf(fp, " %7d", rfpos+1); rfpos++; }
          else               { fprintf(fp, " %7s", "-"); }
        }

      nnongap = esl_vec_DSum(abc_ct[apos], abc->K);
      ntotal  = nnongap + abc_ct[apos][abc->K];

      esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
      esl_vec_DNorm(abc_freq, abc->K);

      /* A column with no counts at all would otherwise print 0/0 (NaN);
       * report a nongap frequency of 0 for it instead.
       */
      fprintf(fp, " %7d %10.8f %10.8f\n",
              apos+1,
              (ntotal > 0.) ? nnongap / ntotal : 0.,
              (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
    }
  fprintf(fp, "//\n");

  free(abc_freq);
  return eslOK;

 ERROR:
  /* Only ESL_ALLOC failures reach here; pass its status through unchanged. */
  free(bg);
  free(abc_freq);
  ESL_FAIL(status, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
/* count_msa()
 *
 * Given an msa, count residues, and optionally base pairs and
 * posterior probabilities per column, and store them in <ret_abc_ct>,
 * <ret_bp_ct> and <ret_pp_ct>.
 *
 * <ret_abc_ct> [0..apos..alen-1][0..abc->K]:
 *   - per position count of each symbol in alphabet over all seqs.
 *
 * <ret_bp_ct>  [0..apos..alen-1][0..abc->Kp-1][0..abc->Kp-1]
 *   - per (non-pknotted) consensus basepair count of each possible
 *     basepair over all seqs. Basepairs are indexed by 'i', the minimum
 *     of 'i:j' for a pair between i and j, where i < j; entries for all
 *     other positions are NULL. Note that non-canonicals and gaps and
 *     the like are all stored independently.
 *     Optional: pass NULL to skip. Requires msa->ss_cons.
 *
 * <ret_pp_ct>  [0..apos..alen-1][0..11]
 *   - per position count of each posterior probability code over all
 *     seqs. Optional: pass NULL to skip. Requires msa->pp.
 *
 * A 'gap' has a looser definition than in esl_abc here: esl_abc's gap,
 * missing residues and nonresidues are all considered 'gaps' here.
 *
 * If <no_ambig> is TRUE, degenerate residues are skipped in the residue
 * and PP counts. If <use_weights> is TRUE each sequence contributes
 * msa->wgt[i] instead of 1.0. <nali> is not used.
 *
 * If we encounter an error, we return non-eslOK status and fill errbuf
 * with an error message; everything allocated here is freed and the
 * <ret_*> arguments are left untouched.
 *
 * Returns eslOK upon success; the caller then owns the returned arrays.
 */
static int
count_msa(ESL_MSA *msa, char *errbuf, int nali, int no_ambig, int use_weights, double ***ret_abc_ct, double ****ret_bp_ct, double ***ret_pp_ct)
{
  int       status;
  double  **abc_ct = NULL;
  double ***bp_ct  = NULL;
  int       apos, rpos, i, x;
  int       nppvals = 12;        /* '0'-'9' = 0-9, '*' = 10, gap = '11' */
  double  **pp_ct = NULL;        /* [0..alen-1][0..nppvals-1] per position count of each possible PP char over all seqs */
  int       ppidx;
  /* variables related to getting bp counts */
  int      *ct = NULL;           /* 0..alen-1 base pair partners array for current sequence */
  char     *ss_nopseudo = NULL;  /* no-pseudoknot version of structure */
  double    seqwt;               /* weight of current sequence, always 1.0 if !use_weights */

  /* Contract checks. Nothing is allocated yet, so a direct return is safe. */
  if (! (msa->flags & eslMSA_DIGITAL))           ESL_FAIL(eslEINVAL,    errbuf, "count_msa() contract violation, MSA is not digitized");
  if (use_weights && msa->wgt == NULL)           ESL_FAIL(eslEINCOMPAT, errbuf, "count_msa(): use_weights==TRUE but msa->wgt == NULL");
  if (ret_pp_ct != NULL && msa->pp == NULL)      ESL_FAIL(eslEINVAL,    errbuf, "count_msa() ret_pp_ct != NULL, but msa->pp is NULL");
  if (ret_bp_ct != NULL && msa->ss_cons == NULL) ESL_FAIL(eslEINVAL,    errbuf, "count_msa() ret_bp_ct != NULL, but msa->ss_cons is NULL");

  /* From here on, every failure must go through ERROR so that nothing leaks.
   * Each pointer array is NULL-filled before its rows are allocated, so the
   * cleanup code never sees an uninitialized pointer.
   */

  /* allocate pp_ct array, if nec */
  if (ret_pp_ct != NULL)
    {
      ESL_ALLOC(pp_ct, sizeof(double *) * msa->alen);
      for (apos = 0; apos < msa->alen; apos++) pp_ct[apos] = NULL;
      for (apos = 0; apos < msa->alen; apos++)
        {
          ESL_ALLOC(pp_ct[apos], sizeof(double) * nppvals);
          esl_vec_DSet(pp_ct[apos], nppvals, 0.);
        }
    }

  /* allocate and initialize bp_ct, if nec */
  if (ret_bp_ct != NULL)
    {
      ESL_ALLOC(bp_ct, sizeof(double **) * msa->alen);
      for (apos = 0; apos < msa->alen; apos++) bp_ct[apos] = NULL;

      /* get ct array which defines the consensus base pairs */
      ESL_ALLOC(ct,          sizeof(int)  * (msa->alen+1));
      ESL_ALLOC(ss_nopseudo, sizeof(char) * (msa->alen+1));
      esl_wuss_nopseudo(msa->ss_cons, ss_nopseudo);
      if ((status = esl_wuss2ct(ss_nopseudo, msa->alen, ct)) != eslOK)
        ESL_XFAIL(status, errbuf, "Consensus structure string is inconsistent.");

      for (apos = 0; apos < msa->alen; apos++)
        {
          /* careful: ct is indexed 1..alen, not 0..alen-1 */
          if (ct[(apos+1)] > (apos+1))
            { /* apos+1 is an 'i' in an i:j pair, where i < j */
              ESL_ALLOC(bp_ct[apos], sizeof(double *) * (msa->abc->Kp));
              for (x = 0; x < msa->abc->Kp; x++) bp_ct[apos][x] = NULL;
              for (x = 0; x < msa->abc->Kp; x++)
                {
                  ESL_ALLOC(bp_ct[apos][x], sizeof(double) * (msa->abc->Kp));
                  esl_vec_DSet(bp_ct[apos][x], msa->abc->Kp, 0.);
                }
            }
          /* else: apos+1 is not an 'i' in an i:j pair; bp_ct[apos] stays NULL */
        }
    }

  ESL_ALLOC(abc_ct, sizeof(double *) * msa->alen);
  for (apos = 0; apos < msa->alen; apos++) abc_ct[apos] = NULL;
  for (apos = 0; apos < msa->alen; apos++)
    {
      ESL_ALLOC(abc_ct[apos], sizeof(double) * (msa->abc->K+1));
      esl_vec_DSet(abc_ct[apos], (msa->abc->K+1), 0.);
    }

  for (i = 0; i < msa->nseq; i++)
    {
      seqwt = use_weights ? msa->wgt[i] : 1.0;

      /* residue counts; careful: ax ranges from 1..msa->alen (but abc_ct is 0..msa->alen-1) */
      for (apos = 0; apos < msa->alen; apos++)
        {
          /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
          if ((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1])))
            {
              if ((status = esl_abc_DCount(msa->abc, abc_ct[apos], msa->ax[i][apos+1], seqwt)) != eslOK)
                ESL_XFAIL(status, errbuf, "problem counting residue %d of seq %d", apos, i);
            }
        }

      /* get bp counts, if nec */
      if (bp_ct != NULL)
        {
          for (apos = 0; apos < msa->alen; apos++)
            {
              if (bp_ct[apos] != NULL)
                { /* our flag for whether position (apos+1) is an 'i' in an i:j pair where i < j */
                  rpos = ct[apos+1] - 1; /* ct is indexed 1..alen */
                  bp_ct[apos][msa->ax[i][apos+1]][msa->ax[i][rpos+1]] += seqwt;
                }
            }
        }

      /* get PP counts, if nec; sequences without PP annotation are skipped */
      if (pp_ct != NULL && msa->pp[i] != NULL)
        {
          for (apos = 0; apos < msa->alen; apos++)
            {
              /* skip ambiguities (degenerate residues) if no_ambig is TRUE */
              if ((! no_ambig) || (! esl_abc_XIsDegenerate(msa->abc, msa->ax[i][apos+1])))
                {
                  if ((ppidx = get_pp_idx(msa->abc, msa->pp[i][apos])) == -1)
                    ESL_XFAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[i][apos]);
                  pp_ct[apos][ppidx] += seqwt;
                }
            }
        }
    }

  *ret_abc_ct = abc_ct;
  if (ret_bp_ct != NULL) *ret_bp_ct = bp_ct; /* we only allocated bp_ct if ret_bp_ct != NULL */
  if (ret_pp_ct != NULL) *ret_pp_ct = pp_ct; /* we only allocated pp_ct if ret_pp_ct != NULL */

  free(ss_nopseudo);
  free(ct);
  return eslOK;

 ERROR:
  free(ss_nopseudo);
  free(ct);
  if (abc_ct != NULL) esl_Free2D((void **)  abc_ct, msa->alen);
  if (bp_ct  != NULL) esl_Free3D((void ***) bp_ct,  msa->alen, msa->abc->Kp);
  if (pp_ct  != NULL) esl_Free2D((void **)  pp_ct,  msa->alen);
  /* Failures raised with ESL_XFAIL already wrote a specific message to
   * errbuf; only overwrite it for the out-of-memory case.
   */
  if (status == eslEMEM) ESL_FAIL(status, errbuf, "Error, out of memory while counting important values in the msa.");
  return status;
}
/* main()
 *
 * Read every sequence in the file named on the command line and print
 * summary statistics: file format, alphabet, number of sequences, total
 * residues, smallest/largest/average length, and (with -c) the residue
 * composition. With -a, one "= name length description" line is also
 * printed per sequence. Sequences are read in 4096-residue windows.
 *
 * Returns 0 on success. Unrecoverable input or option errors terminate
 * the program through esl_fatal()/cmdline_failure(); an allocation
 * failure returns the status set by ESL_ALLOC.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go        = NULL;
  char           *seqfile   = NULL;
  ESL_SQFILE     *sqfp      = NULL;
  int             infmt     = eslSQFILE_UNKNOWN;
  int             alphatype = eslUNKNOWN;
  ESL_ALPHABET   *abc       = NULL;
  ESL_SQ         *sq        = NULL;
  int64_t         nseq      = 0;
  int64_t         nres      = 0;
  int64_t         small     = 0;
  int64_t         large     = 0;
  double         *monoc     = NULL; /* monoresidue composition per sequence  */
  double         *monoc_all = NULL; /* monoresidue composition over all seqs */
  int             do_comp   = FALSE;
  int             status    = eslOK;
  int             wstatus;
  int             i;
  int             x;

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n",   go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 1)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  seqfile = esl_opt_GetArg(go, 1);
  do_comp = esl_opt_GetBoolean(go, "-c");

  if (esl_opt_GetString(go, "--informat") != NULL)
    {
      infmt = esl_sqio_FormatCode(esl_opt_GetString(go, "--informat"));
      /* the %s needs its argument: name the format string the user gave */
      if (infmt == eslSQFILE_UNKNOWN)
        esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
    }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  /* Alphabet: taken from the command line if given, else guessed from the file. */
  if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  else
    {
      status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
      if      (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
      /* cast the line number the same way the read loop below does, so the
       * conversion specifier matches the argument whatever its declared width
       */
      else if (status == eslEFORMAT)    esl_fatal("Sequence file parse error, line %ld of file %s:\n%s\n",
                                                  (long) sqfp->linenumber, seqfile, sqfp->errbuf);
      else if (status == eslENODATA)    esl_fatal("Sequence file %s contains no data?", seqfile);
      else if (status != eslOK)         esl_fatal("Failed to guess alphabet (error code %d)\n", status);
    }

  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sqfile_SetDigital(sqfp, abc);

  if (do_comp)
    {
      ESL_ALLOC(monoc,     (abc->Kp) * sizeof(double));
      ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));
      esl_vec_DSet(monoc_all, abc->Kp, 0.0);
      esl_vec_DSet(monoc,     abc->Kp, 0.0);
    }

  while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
    {
      if (wstatus == eslOK)
        { /* one more window of the current sequence */
          if (do_comp)
            for (i = 1; i <= sq->n; i++)
              monoc[sq->dsq[i]]++;
        }
      else if (wstatus == eslEOD)
        { /* end of the current sequence: fold it into the totals */
          if (nseq == 0) { small = large = sq->L; }
          else
            {
              small = ESL_MIN(small, sq->L);
              large = ESL_MAX(large, sq->L);
            }

          if (esl_opt_GetBoolean(go, "-a"))
            {
              printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
            }

          nres += sq->L;
          nseq++;
          esl_sq_Reuse(sq);

          if (do_comp)
            {
              esl_vec_DAdd(monoc_all, monoc, abc->Kp);
              esl_vec_DSet(monoc, abc->Kp, 0.0);
            }
        }
      else if (wstatus == eslEFORMAT)
        {
          esl_fatal("Failed to parse sequence at line %ld, file %s:\n%s",
                    (long) sqfp->linenumber, sqfp->filename, sqfp->errbuf);
        }
      else
        esl_fatal("Failed in reading sequence:\n%s\n", sqfp->errbuf);
    }

  printf("Format: %s\n", esl_sqio_DescribeFormat(sqfp->format));
  printf("Alphabet type: %s\n", esl_abc_DescribeType(abc->type));
  printf("Number of sequences: %" PRId64 "\n", nseq);
  printf("Total # residues: %" PRId64 "\n", nres);
  printf("Smallest: %" PRId64 "\n", small);
  printf("Largest: %" PRId64 "\n", large);
  /* with no sequences read, report 0.0 rather than dividing by zero */
  printf("Average length: %.1f\n", (nseq > 0) ? (float) nres / (float) nseq : 0.0f);

  if (do_comp)
    {
      printf("\nResidue composition:\n");
      /* always list the K canonical residues; list other symbols only if seen */
      for (x = 0; x < abc->Kp; x++)
        if (x < abc->K || monoc_all[x] > 0)
          printf("residue: %c %10.0f %.4f\n", abc->sym[x], monoc_all[x],
                 (nres > 0) ? monoc_all[x] / (double) nres : 0.0);

      free(monoc);
      free(monoc_all);
    }

  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  return status;
}
/* Function: p7_prior_CreateNucleicNew() * Incept: TJW, Thu Nov 12 21:15:11 EST 2009 [Couch at home] * * Purpose: Creates the default mixture Dirichlet prior for nucleotiden * sequences. * * The transition priors (match, insert, delete) are all * single Dirichlets, originally trained by Graeme * Mitchison in the mid-1990's. Notes have been lost, but * we believe they were trained on an early version of * Pfam. * * The match emission prior is an eight-component mixture * trained against a portion of the rmark dataset * * The insert emission prior is a single Dirichlet with * high $|\alpha|$, such that insert emission probabilities * are essentially fixed by the prior, regardless of * observed count data. * * Returns: a pointer to the new <P7_PRIOR> structure. */ P7_PRIOR * p7_prior_CreateNucleic(void) { int status; P7_PRIOR *pri = NULL; int q; /* Match emission priors are trained on rmark database [Nawrocki 08] */ /* Plus-1 Laplace prior int num_comp = 1; static double defmq[2] = { 1.0 }; static double defm[1][4] = { { 1.0, 1.0, 1.0, 1.0} // }; */ /* int num_comp = 2; static double defmq[2] = { 0.42, 0.58 }; static double defm[2][4] = { { 0.94, 0.90, 0.89, 1.13}, // { 0.096, 0.078, 0.093, 0.089} // }; */ /* //weird - but this performs marginally better than the best 2- 5- or 8-component mixtures tested // (on rmark - MER: 2 better than 5/8-comp , 3 better than 2-comp ) int num_comp = 4; static double defmq[4] = { 0.16, 0.29, 0.12, 0.43 }; static double defm[4][4] = { { 0.36, 0.10, 5.3, 0.13}, // G { 0.05, 0.18, 0.03, 0.19}, // CT { 7.1, 0.13, 0.35, 0.17}, // A { 0.96, 0.92, 0.91, 1.19} // uniform }; */ /*On rmark, this model does only slightly better than the 2-component model It's chosen as the default on grounds of reasonableness, given that it shows a non-uniform transition:transversion ratio. It's based on the results of training against a portion of rmark, but the overspecified numbers resulting from that training have been rounded/simplified. 
*/ int num_comp = 5; static double defmq[5] = { 0.16, 0.13, 0.17, 0.15, 0.39 }; static double defm[5][4] = { { 6.0, 0.2, 0.5, 0.2}, // A { 0.2, 8.0, 0.2, 0.5}, // C { 0.5, 0.2, 8.0, 0.2}, // G { 0.2, 0.5, 0.2, 4.0}, // T { 1.3, 1.2, 1.2, 1.4} // uniform }; /* gives no improved performance in my hands over the 5-component model int num_comp = 8; static double defmq[8] = { 0.13, 0.08, 0.08, 0.13, 0.08, 0.08, 0.17, 0.25 } ; static double defm[8][4] = { { 4.0, 0.3, 0.5, 0.4}, // A { 0.3, 22.0, 0.3, 0.8}, // C { 1.0, 0.4, 28.0, 0.4}, // G { 0.5, 0.8, 0.3, 6.0}, // T { 1.8, 0.8, 6.0, 1.0}, // AG { 0.6, 6.0, 0.6, 2.4}, // CT { 0.03, 0.01, 0.02, 0.02}, // anything, but highly conserved { 2.0, 2.0, 2.0, 2.0} // anything, not much conservation }; */ ESL_ALLOC(pri, sizeof(P7_PRIOR)); pri->tm = pri->ti = pri->td = pri->em = pri->ei = NULL; pri->tm = esl_mixdchlet_Create(1, 3); // match transitions; single component; 3 params pri->ti = esl_mixdchlet_Create(1, 2); // insert transitions; single component; 2 params pri->td = esl_mixdchlet_Create(1, 2); // delete transitions; single component; 2 params pri->em = esl_mixdchlet_Create(num_comp, 4); // match emissions; X component; 4 params pri->ei = esl_mixdchlet_Create(1, 4); // insert emissions; single component; 4 params if (pri->tm == NULL || pri->ti == NULL || pri->td == NULL || pri->em == NULL || pri->ei == NULL) goto ERROR; /* Transition priors: roughly, learned from rmark benchmark - hand-beautified (trimming overspecified significant digits) */ pri->tm->pq[0] = 1.0; pri->tm->alpha[0][0] = 2.0; // TMM pri->tm->alpha[0][1] = 0.1; // TMI pri->tm->alpha[0][2] = 0.1; // TMD pri->ti->pq[0] = 1.0; pri->ti->alpha[0][0] = 0.06; // TIM pri->ti->alpha[0][1] = 0.2; // TII pri->td->pq[0] = 1.0; pri->td->alpha[0][0] = 0.1; // TDM pri->td->alpha[0][1] = 0.2; // TDD /* Match emission priors */ for (q = 0; q < num_comp; q++) { pri->em->pq[q] = defmq[q]; esl_vec_DCopy(defm[q], 4, pri->em->alpha[q]); } /* Insert emission priors. 
Should that alphas be lower? higher? */ pri->ei->pq[0] = 1.0; esl_vec_DSet(pri->ei->alpha[0], 4, 1.0); return pri; ERROR: if (pri != NULL) p7_prior_Destroy(pri); return NULL; }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). 
* Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. 
* afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if 
(esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' 
with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
/* annotate_posterior_probability()
 * Synopsis:  Add posterior probability annotation lines to new MSA.
 *
 * Purpose:   If any trace in <tr> carries posterior probabilities,
 *            allocate and fill <msa->pp> (one PP line per sequence that
 *            has them, NULL for the others) and <msa->pp_cons> (the
 *            encoded mean posterior probability over the sequences
 *            using each match column; '.' in columns no sequence uses
 *            as a match). If no trace has posterior probabilities, the
 *            MSA is left untouched.
 *
 * Args:      msa      - alignment to annotate
 *            tr       - [0..nseq-1] traces, one per sequence in <msa>
 *            matmap   - matmap[k] is the 1-based alignment column of
 *                       model position k
 *            M        - model length
 *            optflags - if p7_TRIM is set, N/C states and inserts at
 *                       k == 0 or k == M get no PP character
 *
 * Returns:   eslOK on success. On allocation failure, returns the
 *            status set by ESL_ALLOC; <msa->pp> is freed and reset to
 *            NULL.
 */
static int
annotate_posterior_probability(ESL_MSA *msa, P7_TRACE **tr, const int *matmap, int M, int optflags)
{
  double *totp   = NULL;  /* total posterior probability in column <apos>: [0..alen-1]    */
  int    *matuse = NULL;  /* #seqs with pp annotation in column <apos>: [0..alen-1]       */
  int     idx;            /* counter over sequences [0..nseq-1]                           */
  int     apos;           /* counter for alignment columns: pp's are [0..alen-1] (unlike ax) */
  int     z;              /* counter over trace positions [0..tr->N-1]                    */
  int     status;

  /* Determine if any of the traces have posterior probability annotation. */
  for (idx = 0; idx < msa->nseq; idx++)
    if (tr[idx]->pp != NULL) break;
  if (idx == msa->nseq) return eslOK;

  ESL_ALLOC(matuse, sizeof(int)    * (msa->alen));  esl_vec_ISet(matuse, msa->alen, 0);
  ESL_ALLOC(totp,   sizeof(double) * (msa->alen));  esl_vec_DSet(totp,   msa->alen, 0.0);

  ESL_ALLOC(msa->pp, sizeof(char *) * msa->sqalloc);
  /* NULL every slot up front: sequences without PP keep NULL, and the
   * ERROR path can free a partially filled array safely.
   */
  for (idx = 0; idx < msa->sqalloc; idx++) msa->pp[idx] = NULL;

  for (idx = 0; idx < msa->nseq; idx++)
    {
      if (tr[idx]->pp == NULL) continue;

      ESL_ALLOC(msa->pp[idx], sizeof(char) * (msa->alen+1));
      for (apos = 0; apos < msa->alen; apos++) msa->pp[idx][apos] = '.';
      msa->pp[idx][msa->alen] = '\0';

      apos = 0;  /* 0-based column where the next unaligned (N/C/I) residue goes */
      for (z = 0; z < tr[idx]->N; z++)
        {
          switch (tr[idx]->st[z]) {
          case p7T_M:
            msa->pp[idx][matmap[tr[idx]->k[z]]-1] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
            totp  [matmap[tr[idx]->k[z]]-1] += tr[idx]->pp[z];
            matuse[matmap[tr[idx]->k[z]]-1]++;
            /* fallthrough: like a delete, a match also advances apos past its column */
          case p7T_D:
            apos = matmap[tr[idx]->k[z]];
            break;

          case p7T_I:
            if ( !(optflags & p7_TRIM) || (tr[idx]->k[z] != 0 && tr[idx]->k[z] != M))
              {
                msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
                apos++;
              }
            break;

          case p7T_N:
          case p7T_C:
            if (! (optflags & p7_TRIM) && tr[idx]->i[z] > 0)
              {
                msa->pp[idx][apos] = p7_alidisplay_EncodePostProb(tr[idx]->pp[z]);
                apos++;
              }
            break;

          case p7T_E:
            apos = matmap[M]; /* set position for C-terminal tail */
            break;

          default:
            break;
          }
        }
    }

  /* Consensus posterior probability annotation: only on match columns */
  ESL_ALLOC(msa->pp_cons, sizeof(char) * (msa->alen+1));
  for (apos = 0; apos < msa->alen; apos++) msa->pp_cons[apos] = '.';
  msa->pp_cons[msa->alen] = '\0';
  for (apos = 0; apos < msa->alen; apos++)
    if (matuse[apos]) msa->pp_cons[apos] = p7_alidisplay_EncodePostProb( totp[apos] / (double) matuse[apos]);

  free(matuse);
  free(totp);
  return eslOK;

 ERROR:
  free(matuse);
  free(totp);
  if (msa->pp != NULL)
    {
      esl_Free2D((void **) msa->pp, msa->sqalloc);
      msa->pp = NULL;  /* don't leave a dangling pointer for whoever destroys the MSA */
    }
  return status;
}