/* The gradient of the NLL w.r.t. each free parameter in p. */ static void hyperexp_complete_gradient(double *p, int np, void *dptr, double *dp) { struct hyperexp_data *data = (struct hyperexp_data *) dptr; ESL_HYPEREXP *h = data->h; double pdf; int i,k; int pidx; hyperexp_unpack_paramvector(p, np, h); esl_vec_DSet(dp, np, 0.); for (i = 0; i < data->n; i++) { /* FIXME: I think the calculation below may need to be done * in log space, to avoid underflow errors; see complete_binned_gradient() */ /* Precalculate q_k PDF_k(x) terms, and their sum */ for (k = 0; k < h->K; k++) h->wrk[k] = h->q[k] * esl_exp_pdf(data->x[i], h->mu, h->lambda[k]); pdf = esl_vec_DSum(h->wrk, h->K); pidx = 0; if (! h->fixmix) { for (k = 1; k < h->K; k++) /* generic d/dQ solution for mixture models */ dp[pidx++] -= h->wrk[k]/pdf - h->q[k]; } for (k = 0; k < h->K; k++) if (! h->fixlambda[k]) dp[pidx++] -= (1.-h->lambda[k]*(data->x[i]-h->mu))*h->wrk[k]/pdf; /* d/dw */ } }
/* Function:  esl_rmx_E2Q()
 * Incept:    SRE, Tue Jul 13 15:52:41 2004 [St. Louis]
 *
 * Purpose:   Builds a rate matrix <Q> from residue exchangeabilities
 *            <E> and stationary residue frequencies <pi>, as
 *
 *            $Q_{ij} = E_{ij} * \pi_j$
 *
 *            <E> is taken to be symmetric ($E_{ij} = E_{ji}$), and
 *            only its lower triangle ($j<i$) is read. Each diagonal
 *            element of <Q> is set to minus the sum of the other
 *            elements in its row.
 *
 *            The resulting <Q> is not normalized to any particular
 *            number of substitutions/site/time unit. See
 *            <esl_rmx_ScaleTo()> for that.
 *
 * Args:      E     - symmetric residue "exchangeabilities";
 *                    only lower triangular entries are used.
 *            pi    - residue frequencies at stationarity.
 *            Q     - RETURN: rate matrix, square (NxN).
 *                    Caller allocates the memory for this.
 *
 * Returns:   <eslOK> on success; Q is calculated and filled in.
 *
 * Throws:    <eslEINVAL> if <E> and <Q> differ in size <n>.
 *
 * Xref:      STL8/p56.
 */
int
esl_rmx_E2Q(ESL_DMATRIX *E, double *pi, ESL_DMATRIX *Q)
{
  int row, col;

  if (E->n != Q->n) ESL_EXCEPTION(eslEINVAL, "E and Q sizes differ");

  /* Off-diagonals: walk the lower triangle of E, and fill in both the
   * lower-triangle cell of Q and its mirror image in the upper triangle.
   * (Row 0 has no lower-triangle cells, so start at row 1.)
   */
  for (row = 1; row < E->n; row++)
    {
      for (col = 0; col < row; col++)
        {
          Q->mx[row][col] = pi[col] * E->mx[row][col];
          Q->mx[col][row] = pi[row] * E->mx[row][col];
        }
    }

  /* Diagonals: minus the sum of the rest of the row. Zeroing the
   * diagonal first lets a plain sum over the whole row serve.
   */
  for (row = 0; row < Q->n; row++)
    {
      Q->mx[row][row] = 0.;
      Q->mx[row][row] = -1. * esl_vec_DSum(Q->mx[row], Q->n);
    }

  return eslOK;
}
/* Function:  esl_rmx_SetWAG()
 * Incept:    SRE, Thu Mar  8 18:00:00 2007 [Janelia]
 *
 * Purpose:   Sets a $20 \times 20$ rate matrix <Q> to WAG parameters.
 *            The caller allocated <Q>.
 *
 *            If <pi> is non-<NULL>, it provides a vector of 20 amino
 *            acid stationary probabilities in Easel alphabetic order,
 *            A..Y, and the WAG stationary probabilities are set to
 *            these desired $\pi_i$. If <pi> is <NULL>, the default
 *            WAG stationary probabilities are used.
 *
 *            The WAG parameters are a maximum likelihood
 *            parameterization obtained by Whelan and Goldman
 *            \citep{WhelanGoldman01}.
 *
 * Note:      The data table was reformatted from wag.dat by the UTILITY1
 *            executable in the paml module. The wag.dat file was obtained from
 *            \url{http://www.ebi.ac.uk/goldman/WAG/wag.dat}. A copy
 *            is in formats/wag.dat.
 *
 * Args:      Q   - a 20x20 rate matrix to set, allocated by caller.
 *            pi  - desired stationary probabilities A..Y, or
 *                  NULL to use WAG defaults.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <Q> isn't a 20x20 general matrix; and
 *            the state of <Q> is undefined.
 */
int
esl_rmx_SetWAG(ESL_DMATRIX *Q, double *pi)
{
  /* WAG exchangeabilities: the lower triangle of a 20x20 symmetric
   * matrix, stored row by row (row r contributes r entries; 190 total).
   */
  static double wagE[190] = {
    1.027040,
    0.738998, 0.030295,
    1.582850, 0.021352, 6.174160,
    0.210494, 0.398020, 0.046730, 0.081134,
    1.416720, 0.306674, 0.865584, 0.567717, 0.049931,
    0.316954, 0.248972, 0.930676, 0.570025, 0.679371, 0.249410,
    0.193335, 0.170135, 0.039437, 0.127395, 1.059470, 0.030450, 0.138190,
    0.906265, 0.074034, 0.479855, 2.584430, 0.088836, 0.373558, 0.890432, 0.323832,
    0.397915, 0.384287, 0.084805, 0.154263, 2.115170, 0.061304, 0.499462, 3.170970, 0.257555,
    0.893496, 0.390482, 0.103754, 0.315124, 1.190630, 0.174100, 0.404141, 4.257460, 0.934276, 4.854020,
    0.509848, 0.265256, 5.429420, 0.947198, 0.096162, 1.125560, 3.956290, 0.554236, 3.012010, 0.131528, 0.198221,
    1.438550, 0.109404, 0.423984, 0.682355, 0.161444, 0.243570, 0.696198, 0.099929, 0.556896, 0.415844, 0.171329, 0.195081,
    0.908598, 0.098818, 0.616783, 5.469470, 0.099921, 0.330052, 4.294110, 0.113917, 3.894900, 0.869489, 1.545260, 1.543640, 0.933372,
    0.551571, 0.528191, 0.147304, 0.439157, 0.102711, 0.584665, 2.137150, 0.186979, 5.351420, 0.497671, 0.683162, 0.635346, 0.679489, 3.035500,
    3.370790, 1.407660, 1.071760, 0.704939, 0.545931, 1.341820, 0.740169, 0.319440, 0.967130, 0.344739, 0.493905, 3.974230, 1.613280, 1.028870, 1.224190,
    2.121110, 0.512984, 0.374866, 0.822765, 0.171903, 0.225833, 0.473307, 1.458160, 1.386980, 0.326622, 1.516120, 2.030060, 0.795384, 0.857928, 0.554413, 4.378020,
    2.006010, 1.002140, 0.152335, 0.588731, 0.649892, 0.187247, 0.118358, 7.821300, 0.305434, 1.800340, 2.058450, 0.196246, 0.314887, 0.301281, 0.251849, 0.232739, 1.388230,
    0.113133, 0.717070, 0.129767, 0.156557, 1.529640, 0.336983, 0.262569, 0.212483, 0.137505, 0.665309, 0.515706, 0.071917, 0.139405, 0.215737, 1.163920, 0.523742, 0.110864, 0.365369,
    0.240735, 0.543833, 0.325711, 0.196303, 6.454280, 0.103604, 3.873440, 0.420170, 0.133264, 0.398618, 0.428437, 1.086000, 0.216046, 0.227710, 0.381533, 0.786993, 0.291148, 0.314730, 2.485390
  };
  static double wagpi[20];   /* default WAG stationary probabilities      */
  double *freqs;             /* stationary vector in use: <pi> or <wagpi> */
  int     row, col;
  int     next;              /* read position in wagE                     */

  if (Q->m != 20 || Q->n != 20 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 20x20 general matrix");

  esl_composition_WAG(wagpi);
  freqs = (pi != NULL) ? pi : wagpi;

  /* 1. Transfer the WAG E lower triangular matrix directly into Q,
   *    mirroring each entry into the upper triangle. The diagonal is
   *    zeroed; the row sums in step 3 depend on that.
   */
  next = 0;
  for (row = 0; row < 20; row++)
    {
      Q->mx[row][row] = 0.;
      for (col = 0; col < row; col++)
        {
          Q->mx[row][col] = wagE[next++];
          Q->mx[col][row] = Q->mx[row][col];
        }
    }

  /* 2,3. Row by row: scale to Q_ij = E_ij * pi_j, then set the
   *      diagonal Q_ii to -\sum_{j \neq i} Q_ij.
   */
  for (row = 0; row < 20; row++)
    {
      for (col = 0; col < 20; col++)
        Q->mx[row][col] *= freqs[col];
      Q->mx[row][row] = -1. * esl_vec_DSum(Q->mx[row], 20);
    }

  /* 4. Renormalize matrix to units of 1 substitution/site. */
  esl_rmx_ScaleTo(Q, freqs, 1.0);
  return eslOK;
}
/* Function:  esl_vec_DNorm()
 * Synopsis:  Normalize probability vector.
 *
 * Purpose:   Normalizes a probability vector <vec> of <n> elements in
 *            place, such that $\sum_{i=1}^{n} \mathrm{vec}_i = 1.0$,
 *            by dividing every element by the current sum.
 *
 *            If the current sum is exactly zero, every element is
 *            instead set to $1/n$.
 *
 *            <esl_vec_FNorm()> does the same, for a probability vector
 *            of floats.
 */
void
esl_vec_DNorm(double *vec, int n)
{
  const double total = esl_vec_DSum(vec, n);
  int          idx;

  if (total == 0.0)
    {
      /* Nothing to scale by: fall back to the uniform distribution. */
      for (idx = 0; idx < n; idx++)
        vec[idx] = 1. / (double) n;
      return;
    }

  for (idx = 0; idx < n; idx++)
    vec[idx] /= total;
}
/* Function:  esl_hyperexp_Read()
 *
 * Purpose:   Reads hyperexponential parameters from <e>, an
 *            <ESL_FILEPARSER> tokenizer for an open stream.
 *
 *            The first token is <K>, the number of mixture components.
 *            The second token is <mu>, the x offset shared by all components.
 *            Then for each mixture component <k=1..K>, it reads
 *            a mixture coefficient <q[k]> and a decay parameter
 *            <lambda[k]>.
 *
 *            The <2K+2> data tokens must occur in this order, but
 *            they can be grouped into any number of lines, because the
 *            parser ignores line breaks.
 *
 *            Anything after a <\#> character on a line is a comment, and
 *            is ignored.
 *
 *            The mixture coefficients must sum to 1 within a tolerance
 *            of 0.05; they are then renormalized to sum to exactly 1.
 *
 * Args:      e       - open file parser to read tokens from. Its
 *                      comment character is set to '#', and its
 *                      <errbuf> receives a message on validation failures.
 *            ret_hxp - RETURN: the new <ESL_HYPEREXP> on success;
 *                      <NULL> on any failure.
 *
 * Returns:   <eslOK> on success, and <ret_hxp> points to a new <ESL_HYPEREXP>
 *            object.
 *            <eslEFORMAT> on "normal" parse failure caused by a bad file
 *            format that's likely the user's fault; this includes
 *            failing to obtain an expected token.
 *
 * Throws:    <eslEMEM> if allocation of the new <ESL_HYPEREXP> fails.
 *
 *
 * FIXME: All our mixture models (esl_dirichlet, for example) should be
 *        reconciled w/ identical interfaces & behaviour.
 */
int
esl_hyperexp_Read(ESL_FILEPARSER *e, ESL_HYPEREXP **ret_hxp)
{
  ESL_HYPEREXP   *hxp = NULL;
  char           *tok;
  int             nc;
  int             k;
  double          sum;

  /* Give the caller a defined value on every failure path below. */
  *ret_hxp = NULL;

  esl_fileparser_SetCommentChar(e, '#');

  /* Token 1: number of components, K. */
  if (esl_fileparser_GetToken(e, &tok, NULL) != eslOK) goto ERROR;
  nc = atoi(tok);
  if (nc < 1) {
    sprintf(e->errbuf, "Expected # of components K >= 1 as first token");
    goto ERROR;
  }

  if ((hxp = esl_hyperexp_Create(nc)) == NULL) return eslEMEM; /* percolation */

  /* Token 2: the shared offset, mu. */
  if (esl_fileparser_GetToken(e, &tok, NULL) != eslOK) goto ERROR;
  hxp->mu = atof(tok);

  /* Then K pairs of (q[k], lambda[k]). */
  for (k = 0; k < hxp->K; k++)
    {
      if (esl_fileparser_GetToken(e, &tok, NULL) != eslOK) goto ERROR;
      hxp->q[k] = atof(tok);

      if (esl_fileparser_GetToken(e, &tok, NULL) != eslOK) goto ERROR;
      hxp->lambda[k] = atof(tok);

      if (hxp->q[k] < 0. || hxp->q[k] > 1.) {
        sprintf(e->errbuf, "Expected a mixture coefficient q[k], 0<=q[k]<=1");
        goto ERROR;
      }
      if (hxp->lambda[k] <= 0.) {
        sprintf(e->errbuf, "Expected a lambda parameter, lambda>0");
        goto ERROR;
      }
    }

  /* Accept coefficients that sum to 1 within tolerance, then clean up
   * any small deviation by renormalizing.
   */
  sum = esl_vec_DSum(hxp->q, hxp->K);
  if (fabs(sum-1.0) > 0.05) {
    sprintf(e->errbuf, "Expected mixture coefficients to sum to 1");
    goto ERROR;
  }
  esl_vec_DNorm(hxp->q, hxp->K);

  *ret_hxp = hxp;
  return eslOK;

 ERROR:
  esl_hyperexp_Destroy(hxp);
  return eslEFORMAT;
}
/* Function:  esl_rmx_SetHKY()
 * Incept:    SRE, Thu Aug 12 08:26:39 2004 [St. Louis]
 *
 * Purpose:   Given stationary base composition <pi> for ACGT, and
 *            transition and transversion relative rates <alpha> and
 *            <beta> respectively, sets the matrix <Q> to be the
 *            corresponding HKY (Hasegawa/Kishino/Yano) DNA rate
 *            matrix, scaled in units of 1t = 1.0 substitutions/site
 *            \citep{Hasegawa85}.
 *
 * Args:      Q      - RETURN: 4x4 general rate matrix to set.
 *            pi     - stationary base composition A..T
 *            alpha  - relative transition rate
 *            beta   - relative transversion rate
 *
 * Returns:   <eslOK>
 *
 * Throws:    <eslEINVAL> if <Q> isn't a 4x4 general matrix.
 *
 * Xref:
 */
int
esl_rmx_SetHKY(ESL_DMATRIX *Q, double *pi, double alpha, double beta)
{
  int row, col;

  if (Q->m != 4 || Q->n != 4 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 4x4 general matrix");

  for (row = 0; row < 4; row++)
    {
      for (col = 0; col < 4; col++)
        {
          double rate;

          /* Parity of row+col picks the rate class:
           * even = transition (alpha), odd = transversion (beta).
           */
          if      (row == col)      rate = 0.;
          else if ((row+col) % 2)   rate = pi[col]*beta;
          else                      rate = pi[col]*alpha;

          Q->mx[row][col] = rate;
        }

      /* Diagonal was just zeroed, so a whole-row sum is the off-diagonal sum. */
      Q->mx[row][row] = -1. * esl_vec_DSum(Q->mx[row], 4);
    }

  esl_rmx_ScaleTo(Q, pi, 1.0);
  return eslOK;
}
/* Function:  esl_rmx_ValidateP()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates a conditional probability matrix <P>, whose
 *            elements $P_{ij}$ represent conditional probabilities
 *            $P(j \mid i)$; for example in a first-order Markov
 *            chain, or a continuous-time Markov transition process
 *            where <P> is for a particular $t$.
 *
 *            Rows must sum to one, and each element $P_{ij}$ is a
 *            probability $0 \leq P_{ij} \leq 1$. For each row, the
 *            row sum is checked before the individual elements.
 *
 *            <tol> specifies the floating-point tolerance to which
 *            the row sums must equal one: <fabs(sum-1.0) <= tol>.
 *
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *
 * Args:      P      - matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation.
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    <eslEINVAL> if <P> is not of type <eslGENERAL>.
 */
int
esl_rmx_ValidateP(ESL_DMATRIX *P, double tol, char *errbuf)
{
  int    row, col;
  double rowsum;

  if (P->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "P must be type eslGENERAL to be validated");

  for (row = 0; row < P->n; row++)
    {
      rowsum = esl_vec_DSum(P->mx[row], P->m);
      if (fabs(rowsum-1.0) > tol)
        ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 1.0", row);

      col = 0;
      while (col < P->m)
        {
          if (P->mx[row][col] < 0.0 || P->mx[row][col] > 1.0)
            ESL_FAIL(eslFAIL, errbuf, "element %d,%d is not a probability (%f)", row, col, P->mx[row][col]);
          col++;
        }
    }

  return eslOK;
}
/* Function:  esl_rmx_SetF81()
 * Incept:    SRE, Thu Mar 15 13:33:30 2007 [Janelia]
 *
 * Purpose:   Sets a 4x4 rate matrix <Q> to the F81 model (aka
 *            equal-input model) given stationary base
 *            compositions <pi>, scaled to units of
 *            1t = 1.0 substitutions/site.
 *
 *            Each off-diagonal $Q_{ij}$ is set to $\pi_j$, and each
 *            diagonal to minus the sum of the rest of its row, before
 *            scaling.
 *
 * Args:      Q   - RETURN: 4x4 general rate matrix to set.
 *            pi  - stationary base composition, 4 entries.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <Q> isn't a 4x4 general matrix.
 */
int
esl_rmx_SetF81(ESL_DMATRIX *Q, double *pi)
{
  int row, col;

  if (Q->m != 4 || Q->n != 4 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 4x4 general matrix");

  for (row = 0; row < 4; row++)
    {
      for (col = 0; col < 4; col++)
        Q->mx[row][col] = (row == col) ? 0.0 : pi[col];

      /* Diagonal is zero at this point, so the whole-row sum is the off-diagonal sum. */
      Q->mx[row][row] = -1. * esl_vec_DSum(Q->mx[row], 4);
    }

  esl_rmx_ScaleTo(Q, pi, 1.0);
  return eslOK;
}
/* dump_residue_info
 *
 * Given per-column counts for an MSA, print out for each column of
 * the alignment the number (and fraction, relative to <nseq>) of
 * residues and of gaps.
 *
 * <abc_ct[apos]> holds counts for alignment column <apos>: entries
 * 0..abc->K-1 are summed as the residue count, and entry abc->K is
 * taken as the gap count. If <i_am_rf> is non-NULL, an extra leading
 * column reports the 1-based index of each column flagged in
 * <i_am_rf>, or "-" for unflagged columns.
 *
 * <errbuf> is unused. Always returns <eslOK>.
 */
static int
dump_residue_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int    col;          /* alignment column, 0-based                */
  int    nrf = 0;      /* number of flagged RF columns seen so far */
  double nres, ngap;   /* residue and gap counts for one column    */

  /* File header.
   * NOTE(review): "Insert information" looks copied from the insert
   * dump; kept as is because it is part of the output format.
   */
  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if (msa_name != NULL) fprintf(fp, "# Alignment name: %s\n", msa_name);
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if (use_weights) fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n");
  else             fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n");
  fprintf(fp, "#\n");

  /* Column titles, with or without the rfpos column. */
  if (i_am_rf == NULL)
    {
      fprintf(fp, "# %7s %10s %8s %10s %8s\n", "alnpos", "numres", "freqres", "numgap", "freqgap");
      fprintf(fp, "# %7s %10s %8s %10s %8s\n", "-------", "----------", "--------", "----------", "--------");
    }
  else
    {
      fprintf(fp, "# %7s %7s %10s %8s %10s %8s\n", "rfpos", "alnpos", "numres", "freqres", "numgap", "freqgap");
      fprintf(fp, "# %7s %7s %10s %8s %10s %8s\n", "-------", "-------", "----------", "--------", "----------", "--------");
    }

  /* One data row per alignment column. */
  for (col = 0; col < alen; col++)
    {
      nres = esl_vec_DSum(abc_ct[col], abc->K);
      ngap = abc_ct[col][abc->K];

      if (i_am_rf != NULL)
        {
          if (i_am_rf[col]) fprintf(fp, " %7d", ++nrf);
          else              fprintf(fp, " %7s", "-");
        }
      fprintf(fp, " %7d %10.1f %8.6f %10.1f %8.6f\n", col+1, nres, nres / (float) nseq, ngap, ngap / (float) nseq);
    }
  fprintf(fp, "//\n");

  return eslOK;
}
/* dump_posterior_column_info
 *
 * Dump per-column posterior probability (PP) data to a file.
 *
 * <pp_ct[apos]> holds 12 counts for alignment column <apos>, one per
 * PP code in the order "0123456789*.". The first 11 are summed as
 * the number of non-gap positions. The reported average PP is the
 * sum over those 11 classes of (count * representative PP value for
 * the class), divided by the non-gap total.
 *
 * If <i_am_rf> is non-NULL, an <rfpos> column is included: the
 * 1-based index of each column flagged in <i_am_rf>, or "-" for
 * unflagged columns. When it is NULL, that column is left out of the
 * title line, the separator line, and the data rows alike.
 *
 * <errbuf> is unused. Always returns <eslOK>.
 */
static int
dump_posterior_column_info(FILE *fp, double **pp_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int    p,apos;                  /* counters over PP classes, columns of MSA */
  int    nppvals = 12;            /* number of PP classes, including gap      */
  int    rfpos;
  double nnongap;
  double sum;
  float  ppavgA[11];              /* representative PP value per non-gap class */
  char   ppstring[12] = "0123456789*.";  /* exactly 12 chars; not NUL-terminated, only indexed */

  ppavgA[0]  = 0.025;
  ppavgA[1]  = 0.10;
  ppavgA[2]  = 0.20;
  ppavgA[3]  = 0.30;
  ppavgA[4]  = 0.40;
  ppavgA[5]  = 0.50;
  ppavgA[6]  = 0.60;
  ppavgA[7]  = 0.70;
  ppavgA[8]  = 0.80;
  ppavgA[9]  = 0.90;
  ppavgA[10] = 0.975;

  fprintf(fp, "# Posterior probability stats per column:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  /* Title line. */
  fprintf(fp, "# %6s", "alnpos");
  if(i_am_rf != NULL) fprintf(fp, " %6s", "rfpos");
  fprintf(fp, " %9s", "nnongap");
  for(p = 0; p < nppvals; p++) {
    fprintf(fp, " %9c", ppstring[p]);
  }
  fprintf(fp, " %9s\n", "avgPP");

  /* Separator line: must carry the rfpos column only when the title
   * line and the data rows do, or the columns no longer line up.
   */
  fprintf(fp, "# %6s", "------");
  if(i_am_rf != NULL) fprintf(fp, " %6s", "------");
  fprintf(fp, " %9s", "---------");
  for(p = 0; p < nppvals; p++) {
    fprintf(fp, " %9s", "---------");
  }
  fprintf(fp, " %9s\n", "---------");

  rfpos = 1;
  for(apos = 0; apos < alen; apos++) {
    sum = 0;
    fprintf(fp, " %6d", apos+1);
    if(i_am_rf != NULL) {
      if(i_am_rf[apos]) fprintf(fp, " %6d", rfpos++);
      else              fprintf(fp, " %6s", "-");
    }
    nnongap = esl_vec_DSum(pp_ct[apos], 11);
    fprintf(fp, " %9.1f", nnongap);
    for(p = 0; p < nppvals; p++) {
      fprintf(fp, " %9.1f", pp_ct[apos][p]);
      if(p <= 10) sum += pp_ct[apos][p] * ppavgA[p];   /* class 11 (gap) has no PP value */
    }
    /* NOTE(review): for a column with no non-gap positions this is 0/0
     * and prints as NaN; left unchanged to keep data rows as they were.
     */
    fprintf(fp, " %.5f\n", sum / nnongap);
  }
  fprintf(fp, "//\n");

  return eslOK;
}
/* dump_infocontent_info
 *
 * Given per-column residue counts for an MSA, dump information
 * content per column data to an open output file.
 *
 * For each alignment column, reports the fraction of non-gap counts
 * (residue total / (residue total + gap count)), and the information
 * content, computed as the entropy of a uniform distribution over
 * the <abc->K> residues minus the entropy of the column's normalized
 * residue frequencies.
 *
 * <abc_ct[apos]> holds counts for alignment column <apos>: entries
 * 0..abc->K-1 are residue counts, and entry abc->K is the gap count.
 * If <i_am_rf> is non-NULL, an extra leading column reports the
 * 1-based index of each column flagged in <i_am_rf>, or "-" for
 * unflagged columns.
 *
 * Returns <eslOK> on success. On allocation failure, returns
 * <eslEMEM> and leaves an "out of memory" message in <errbuf>.
 */
static int
dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int     status;
  int     apos, rfpos;
  double  bg_ent;                /* entropy of the uniform background        */
  double *bg       = NULL;       /* uniform background over abc->K residues  */
  double *abc_freq = NULL;       /* scratch: one column's residue frequencies */
  double  nnongap;

  /* The background is only needed long enough to get its entropy. */
  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);

  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx: %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if(i_am_rf != NULL) {
    fprintf(fp, "# %7s %7s %10s %10s\n", "rfpos", "alnpos", "freqnongap", "info(bits)");
    fprintf(fp, "# %7s %7s %10s %10s\n", "-------", "-------", "----------", "----------");
  }
  else {
    fprintf(fp, "# %7s %10s %10s\n", "alnpos", "freqnongap", "info(bits)");
    fprintf(fp, "# %7s %10s %10s\n", "-------", "----------", "----------");
  }

  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    if(i_am_rf != NULL) {
      if(i_am_rf[apos]) { fprintf(fp, " %7d", rfpos+1); rfpos++; }
      else              { fprintf(fp, " %7s", "-"); }
    }
    nnongap = esl_vec_DSum(abc_ct[apos], abc->K);

    /* Normalize a copy, so the caller's counts are left untouched. */
    esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
    esl_vec_DNorm(abc_freq, abc->K);

    fprintf(fp, " %7d %10.8f %10.8f\n", apos+1,
            nnongap / (nnongap + abc_ct[apos][abc->K]),
            (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
  }
  fprintf(fp, "//\n");

  free(abc_freq);
  return eslOK;

 ERROR:
  /* Only reached from a failed ESL_ALLOC; report it as a memory error. */
  ESL_FAIL(eslEMEM, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
/* main()
 *
 * Driver for the alignment statistics miniapp: parses the command
 * line, opens the alignment file (through the legacy small-memory
 * reader if --small, otherwise the standard reader), and for each
 * alignment prints summary statistics to stdout and, if requested,
 * dumps per-column / per-sequence data to the optional output files.
 *
 * Any failure is fatal (via exit() or esl_fatal()). Returns 0 on success.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile = NULL;	/* alignment file name             */
  int           fmt     = eslMSAFILE_UNKNOWN; /* format code for alifile */
  ESLX_MSAFILE *afp     = NULL;	/* open msa file                   */
  ESL_MSAFILE2 *old_afp = NULL;	/* open msa file, legacy (--small) */
  ESL_MSA      *msa     = NULL;	/* one multiple sequence alignment */
  int           nali;		/* number of alignments read       */
  int           i;		/* counter over seqs               */
  int64_t       alen;		/* alignment length                */
  int           nseq;           /* number of sequences in the msa  */
  int64_t       rlen;		/* a raw (unaligned) seq length    */
  int64_t       small, large;	/* smallest, largest sequence      */
  int64_t       nres;		/* total # of residues in msa      */
  double        avgid;		/* average fractional pair id      */
  int           max_comparisons;/* maximum # comparisons for avg id */
  int           do_stall;       /* used to stall when debugging    */
  double      **abc_ct = NULL;  /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */
  double     ***bp_ct = NULL;   /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair *
                                 * count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum *
                                 * of 'i:j' for a pair between i and j, where i < j. */
  double      **pp_ct = NULL;   /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */
  int          *i_am_rf = NULL; /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */
  int          *rf2a_map = NULL;/* [0..rfpos..rflen-1] = apos,
                                 * apos is the alignment position (0..msa->alen-1) that
                                 * is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */
  int           rflen = -1;     /* nongap RF length */
  char          errbuf[eslERRBUFSIZE];
  int           status;		/* easel return code               */
  const char   *name;           /* alignment name to print in -1 mode */

  /* optional output files */
  FILE *iinfofp  = NULL; /* output file for --iinfo */
  FILE *pcinfofp = NULL; /* output file for --pcinfo */
  FILE *psinfofp = NULL; /* output file for --psinfo */
  FILE *rinfofp  = NULL; /* output file for --rinfo */
  FILE *icinfofp = NULL; /* output file for --icinfo */
  FILE *listfp   = NULL; /* output file for --list */
  FILE *cinfofp  = NULL; /* output file for --cinfo */
  FILE *bpinfofp = NULL; /* output file for --bpinfo */
  int   use_weights;     /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */
  int   weights_exist;   /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\n optional output files:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1)
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));

  if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM)
    esl_fatal("--small requires --informat pfam\n");

  max_comparisons = 1000;

  do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
  while (do_stall);

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  /* We'd like to get rid of the legacy msafile interface, but it
   * includes small memory functionality for Pfam format which we have
   * to replace first. For now, use both interfaces, new and legacy
   */
  if ( esl_opt_GetBoolean(go, "--small") )
    {
      if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified.");

      status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp);
      if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
      else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
      else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);
    }
  else
    {
      if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
        eslx_msafile_OpenFailure(afp, status);
    }

  /**************************************
   * Open optional output files, as nec *
   **************************************/
  /* determine name for first list file, if nec */
  if( esl_opt_IsOn(go, "--list")) {
    if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL)
      esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list"));
  }
  if( esl_opt_IsOn(go, "--icinfo")) {
    if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL)
      esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo"));
  }
  if( esl_opt_IsOn(go, "--rinfo")) {
    if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL)
      esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo"));
  }
  if( esl_opt_IsOn(go, "--pcinfo")) {
    if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL)
      esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo"));
  }
  if( esl_opt_IsOn(go, "--psinfo")) {
    if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL)
      esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo"));
  }
  if( esl_opt_IsOn(go, "--iinfo")) {
    if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL)
      esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo"));
  }
  if( esl_opt_IsOn(go, "--cinfo")) {
    if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL)
      esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo"));
  }
  if( esl_opt_IsOn(go, "--bpinfo")) {
    if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL)
      esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo"));
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/

  if (esl_opt_GetBoolean(go, "-1")) {
    puts("#");
    if(! esl_opt_GetBoolean(go, "--small")) {
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id");
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---");
    }
    else {
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen");
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------");
    }
  }

  nali = 0;
  fmt  = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format);

  while ( (status = ( esl_opt_GetBoolean(go, "--small")
                      ? esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL)
                      : eslx_msafile_Read        (afp, &msa))) == eslOK)
    {
      nali++;

      nres = 0;
      if (! esl_opt_GetBoolean(go, "--small")) {
        nseq = msa->nseq;
        alen = msa->alen;
        small = large = -1;
        for (i = 0; i < msa->nseq; i++)
          {
            rlen  = esl_abc_dsqrlen(msa->abc, msa->ax[i]);
            nres += rlen;
            if (small == -1 || rlen < small) small = rlen;
            if (large == -1 || rlen > large) large = rlen;
          }

        esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid);
      }
      else { /* --small invoked */
        for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K);
      }

      if (esl_opt_GetBoolean(go, "-1"))
        {
          /* msa->name may be NULL (the verbose branch below tests for it),
           * and passing NULL for %s is undefined behavior; print an
           * explicit placeholder instead.
           */
          name = (msa->name != NULL) ? msa->name : "(null)";

          printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64,
                 nali,
                 name,
                 eslx_msafile_DecodeFormat(fmt),
                 nseq,
                 alen,
                 nres);
          if (! esl_opt_GetBoolean(go, "--small")) {
            printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n",
                   small,
                   large,
                   (double) nres / (double) msa->nseq,
                   100.*avgid);
          }
          else {
            printf(" %10.1f\n", (double) nres / (double) nseq);
          }
        }
      else
        {
          printf("Alignment number: %d\n", nali);
          if (msa->name != NULL)
            printf("Alignment name: %s\n", msa->name);
          printf("Format: %s\n", eslx_msafile_DecodeFormat(fmt));
          printf("Number of sequences: %d\n", nseq);
          printf("Alignment length: %" PRId64 "\n", alen);
          printf("Total # residues: %" PRId64 "\n", nres);
          if(! esl_opt_GetBoolean(go, "--small")) {
            printf("Smallest: %" PRId64 "\n", small);
            printf("Largest: %" PRId64 "\n", large);
          }
          printf("Average length: %.1f\n", (double) nres / (double) nseq);
          if(! esl_opt_GetBoolean(go, "--small")) {
            printf("Average identity: %.0f%%\n", 100.*avgid);
          }
          printf("//\n");
        }

      /* Dump data to optional output files, if nec */
      if(esl_opt_IsOn(go, "--list")) {
        if(! esl_opt_GetBoolean(go, "--small")) {
          /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */
          for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]);
        }
      }

      /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions
       * and rf2a_map, a map of non-gap RF positions to overall alignment positions */
      if(msa->rf != NULL) {
        if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal("%s", errbuf);
      }
      else i_am_rf = NULL;

      weights_exist = check_msa_weights(msa);
      use_weights   = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE;

      if( (! esl_opt_GetBoolean(go, "--small")) &&
          (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo") || esl_opt_IsOn(go, "--pcinfo") ||
           esl_opt_IsOn(go, "--cinfo")  || esl_opt_IsOn(go, "--bpinfo"))) {
        /* collect counts of each residue and PPs (if they exist) from the msa */
        if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
        if((status = count_msa(msa, errbuf, nali,
                               esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */
                               esl_opt_GetBoolean(go, "--weight"),  /* use msa->wgt sequence weights? */
                               &abc_ct,
                               ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL),  /* get basepair counts? */
                               (msa->pp != NULL ? &pp_ct : NULL)))                            /* get PP counts? */
           != eslOK) esl_fatal("%s", errbuf);
      }

      /* Each dump below keeps the callee's return code in <status>, and
       * reports the callee's message through a fixed "%s" format so that
       * any '%' in <errbuf> is printed literally.
       */
      if( esl_opt_IsOn(go, "--icinfo")) {
        if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if( esl_opt_IsOn(go, "--rinfo")) {
        if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if(esl_opt_IsOn(go, "--pcinfo")) {
        if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
        if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if(esl_opt_IsOn(go, "--psinfo")) {
        if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
        if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if( esl_opt_IsOn(go, "--iinfo")) {
        if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali);
        if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
        if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if( esl_opt_IsOn(go, "--cinfo")) {
        if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }
      if( esl_opt_IsOn(go, "--bpinfo")) {
        if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali);
        if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf)) != eslOK) esl_fatal("%s", errbuf);
      }

      /* Release everything that belongs to this alignment, and reset
       * the pointers so the next iteration starts clean.
       */
      esl_msa_Destroy(msa);
      if(abc_ct != NULL)   { esl_Free2D((void **) abc_ct, alen);          abc_ct   = NULL; }
      if(bp_ct != NULL)    { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct    = NULL; }
      if(pp_ct != NULL)    { esl_Free2D((void **) pp_ct, alen);           pp_ct    = NULL; }
      if(i_am_rf != NULL)  { free(i_am_rf);                               i_am_rf  = NULL; }
      if(rf2a_map != NULL) { free(rf2a_map);                              rf2a_map = NULL; }
    }

  /* If an msa read failed, we've dropped out to here with an informative status code.
   * we have to handle failures from new vs. legacy msa parsing differently
   */
  if (esl_opt_GetBoolean(go, "--small"))
    {
      if      (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf);
      else if (status != eslEOF)     esl_fatal("Alignment file read failed with error code %d\n", status);
      else if (nali   == 0)          esl_fatal("No alignments found in file %s\n", alifile);
    }
  else
    {
      if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status);
    }

  /* Cleanup, normal return */
  if(listfp != NULL) {
    fclose(listfp);
    printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list"));
  }
  if(icinfofp != NULL) {
    fclose(icinfofp);
    printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo"));
  }
  if(rinfofp != NULL) {
    fclose(rinfofp);
    printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo"));
  }
  if(pcinfofp != NULL) {
    fclose(pcinfofp);
    printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo"));
  }
  if(psinfofp != NULL) {
    fclose(psinfofp);
    printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo"));
  }
  if(iinfofp != NULL) {
    printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo"));
    fclose(iinfofp);
  }
  if(cinfofp != NULL) {
    printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo"));
    fclose(cinfofp);
  }
  if(bpinfofp != NULL) {
    printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo"));
    fclose(bpinfofp);
  }

  if (afp)     eslx_msafile_Close(afp);
  if (old_afp) esl_msafile2_Close(old_afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}