static void utest_Diagonalization(void) { ESL_DMATRIX *P = NULL; ESL_DMATRIX *P2 = NULL; ESL_DMATRIX *C = NULL; ESL_DMATRIX *D = NULL; double *lambda = NULL; /* eigenvalues */ ESL_DMATRIX *U = NULL; /* left eigenvectors */ ESL_DMATRIX *Ui = NULL; /* inverse of U */ int i,j; /* Create a J/C probability matrix for t=1: * 1/4 + 3/4 e^{-4/3 at} * 1/4 - 1/4 e^{-4/3 at} */ if ((P = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((C = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((Ui = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((D = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((P2 = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); for (i = 0; i < 4; i++) for (j = 0; j < 4; j++) if (i == j) P->mx[i][j] = 0.25 + 0.75 * exp(-4./3.); else P->mx[i][j] = 0.25 - 0.25 * exp(-4./3.); /* Diagonalize it */ if (esl_dmx_Diagonalize(P, &lambda, NULL, &U, NULL) != eslOK) esl_fatal("diagonalization failed"); /* Calculate P^k by U [diag(lambda_i)]^k U^{-1} */ esl_dmatrix_SetZero(D); for (i = 0; i < P->n; i++) D->mx[i][i] = lambda[i]; esl_dmx_Invert(U, Ui); esl_dmx_Multiply(U, D, C); esl_dmx_Multiply(C, Ui, P2); if (esl_dmatrix_Compare(P, P2, 1e-7) != eslOK) esl_fatal("diagonalization unit test failed"); free(lambda); esl_dmatrix_Destroy(P2); esl_dmatrix_Destroy(Ui); esl_dmatrix_Destroy(U); esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(C); esl_dmatrix_Destroy(P); return; }
/* Function: esl_paml_ReadE() * Incept: SRE, Fri Jul 9 09:27:24 2004 [St. Louis] * * Purpose: Read an amino acid rate matrix in PAML format from stream * <fp>. Return it in two pieces: the symmetric E * exchangeability matrix in <E>, and the stationary * probability vector $\pi$ in <pi>. * Caller provides the memory for both <E> and <pi>. <E> * is a $20 \times 20$ matrix allocated as * <esl_dmatrix_Create(20, 20)>. <pi> is an array with * space for at least 20 doubles. * * The <E> matrix is symmetric for off-diagonal elements: * $E_{ij} = E_{ij}$ for $i \neq j$. The on-diagonal * elements $E_{ii}$ are not valid and should not be * accessed. (They are set to zero.) * The rate matrix will later be obtained from <E> * and <pi> as * $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$ * and * $Q_{ii} = -\sum_{j \neq i} Q_{ij}$ * then scaled to units of one * substitution/site; see <esl_ratemx_E2Q()> and * <esl_ratemx_ScaleTo()>. * * Data file format: First 190 numbers are a * lower-triangular matrix E of amino acid * exchangeabilities $E_{ij}$. Next 20 numbers are the * amino acid frequencies $\pi_i$. Remainder of the * datafile is ignored. * * The alphabet order in the matrix and the frequency * vector is assumed to be "ARNDCQEGHILKMFPSTWYV" * (alphabetical by three-letter code), which appears to be * PAML's default order. This is transformed to Easel's * "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code) * in the $E_{ij}$ and $\pi_i$ that are returned. * * Args: fp - open datafile for reading. * E - RETURN: E matrix of amino acid exchangeabilities e_ij, * symmetric (E_ij = E_ji), * in Easel amino acid alphabet order A..Y. * Caller provides appropriately allocated space. * pi - RETURN: \pi_i vector of amino acid frequencies, * in Easel amino acid alphabet order A..Y. * Caller provides appropriately allocated space. * * Returns: <eslOK> on success. * Returns <eslEOF> on premature end of file (parse failed), in which * case the contents of <E> and <pi> are undefined. * * Throws: <eslEMEM> on internal allocation failure, * and the contents of <E> and <pi> are undefined. * * Xref: STL8/p.56. */ int esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi) { int status; ESL_FILEPARSER *efp = NULL; char *tok; int i,j; char *pamlorder = "ARNDCQEGHILKMFPSTWYV"; char *eslorder = "ACDEFGHIKLMNPQRSTVWY"; int perm[20]; if ((status = esl_dmatrix_SetZero(E)) != eslOK) goto ERROR; esl_vec_DSet(pi, 20, 0.); if ((efp = esl_fileparser_Create(fp)) == NULL) goto ERROR; if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR; /* Construct the alphabet permutation we need. * perm[i] -> original row/column i goes to row/column perm[i] */ for (i = 0; i < 20; i++) perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder); /* Read the s_ij matrix data in, permuting as we go. */ for (i = 1; i < 20; i++) for (j = 0; j < i; j++) { if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR; E->mx[perm[i]][perm[j]] = atof(tok); E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]]; } /* Read the pi_i vector in, permuting as we read. */ for (i = 0; i < 20; i++) { if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR; pi[perm[i]] = atof(tok); } esl_fileparser_Destroy(efp); return eslOK; ERROR: if (efp != NULL) esl_fileparser_Destroy(efp); return status; }
int main(int argc, char **argv) { ESL_ALPHABET *abc = NULL; /* sequence alphabet */ ESL_GETOPTS *go = NULL; /* command line processing */ ESL_RANDOMNESS *r = NULL; /* source of randomness */ P7_HMM *hmm = NULL; /* sampled HMM to emit from */ P7_HMM *core = NULL; /* safe copy of the HMM, before config */ P7_BG *bg = NULL; /* null model */ ESL_SQ *sq = NULL; /* sampled sequence */ P7_TRACE *tr = NULL; /* sampled trace */ P7_PROFILE *gm = NULL; /* profile */ int i,j; int i1,i2; int k1,k2; int iseq; FILE *fp = NULL; double expected; int do_ilocal; char *hmmfile = NULL; int nseq; int do_swlike; int do_ungapped; int L; int M; int do_h2; char *ipsfile = NULL; char *kpsfile = NULL; ESL_DMATRIX *imx = NULL; ESL_DMATRIX *kmx = NULL; ESL_DMATRIX *iref = NULL; /* reference matrix: expected i distribution under ideality */ int Lbins; int status; char errbuf[eslERRBUFSIZE]; /***************************************************************** * Parse the command line *****************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:\n"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2 = indentation; 80=textwidth*/ return eslOK; } do_ilocal = esl_opt_GetBoolean(go, "-i"); hmmfile = esl_opt_GetString (go, "-m"); nseq = esl_opt_GetInteger(go, "-n"); do_swlike = esl_opt_GetBoolean(go, "-s"); do_ungapped = esl_opt_GetBoolean(go, "-u"); L = esl_opt_GetInteger(go, "-L"); M = esl_opt_GetInteger(go, "-M"); do_h2 = esl_opt_GetBoolean(go, "-2"); ipsfile = esl_opt_GetString (go, "--ips"); kpsfile = esl_opt_GetString (go, "--kps"); if (esl_opt_ArgNumber(go) != 0) { puts("Incorrect number of command line arguments."); printf("Usage: %s [options]\n", argv[0]); return eslFAIL; } r = esl_randomness_CreateFast(0); if (hmmfile != NULL) { /* Read the HMM (and get alphabet from it) */ P7_HMMFILE *hfp = NULL; status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); if ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslOK) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else esl_fatal("Unexpected error in reading HMMs"); } M = hmm->M; p7_hmmfile_Close(hfp); } else { /* Or sample the HMM (create alphabet first) */ abc = esl_alphabet_Create(eslAMINO); if (do_ungapped) p7_hmm_SampleUngapped(r, M, abc, &hmm); else if (do_swlike) p7_hmm_SampleUniform (r, M, abc, 0.05, 0.5, 0.05, 0.2, &hmm); /* tmi, tii, tmd, tdd */ else p7_hmm_Sample (r, M, abc, &hmm); } Lbins = M; imx = esl_dmatrix_Create(Lbins, Lbins); iref = esl_dmatrix_Create(Lbins, Lbins); kmx = esl_dmatrix_Create(M, M); esl_dmatrix_SetZero(imx); esl_dmatrix_SetZero(iref); esl_dmatrix_SetZero(kmx); tr = p7_trace_Create(); sq = esl_sq_CreateDigital(abc); bg = p7_bg_Create(abc); core = p7_hmm_Clone(hmm); if (do_h2) { gm = p7_profile_Create(hmm->M, abc); p7_H2_ProfileConfig(hmm, bg, gm, p7_UNILOCAL); } else { gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL); if (p7_hmm_Validate (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!"); if (p7_profile_Validate(gm, NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!"); } /* Sample endpoints. * Also sample an ideal reference distribution for i endpoints. i * endpoints are prone to discretization artifacts, when emitted * sequences have varying lengths. Taking log odds w.r.t. an ideal * reference that is subject to the same discretization artifacts * cancels out the effect. */ for (iseq = 0; iseq < nseq; iseq++) { if (do_ilocal) ideal_local_endpoints (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2); else profile_local_endpoints(r, core, gm, sq, tr, Lbins, &i1, &i2, &k1, &k2); imx->mx[i1-1][i2-1] += 1.; kmx->mx[k1-1][k2-1] += 1.; /* reference distribution for i */ ideal_local_endpoints (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2); iref->mx[i1-1][i2-1] += 1.; } /* Adjust both mx's to log_2(obs/exp) ratio */ printf("Before normalization/log-odds:\n"); printf(" i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx)); printf(" k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx)); printf("iref matrix values range from %f to %f\n", dmx_upper_min(iref), dmx_upper_max(iref)); expected = (double) nseq * 2. / (double) (M*(M+1)); for (i = 0; i < kmx->m; i++) for (j = i; j < kmx->n; j++) kmx->mx[i][j] = log(kmx->mx[i][j] / expected) / log(2.0); for (i = 0; i < imx->m; i++) for (j = i; j < imx->m; j++) if (iref->mx[i][j] == 0. && imx->mx[i][j] == 0.) imx->mx[i][j] = 0.; else if (iref->mx[i][j] == 0.) imx->mx[i][j] = eslINFINITY; else if (imx->mx[i][j] == 0.) imx->mx[i][j] = -eslINFINITY; else imx->mx[i][j] = log(imx->mx[i][j] / iref->mx[i][j]) / log(2.0); /* Print ps files */ if (kpsfile != NULL) { if ((fp = fopen(kpsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", kpsfile); dmx_Visualize(fp, kmx, -4., 5.); fclose(fp); } if (ipsfile != NULL) { if ((fp = fopen(ipsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", ipsfile); dmx_Visualize(fp, imx, -4., 5.); /* dmx_Visualize(fp, imx, dmx_upper_min(imx), dmx_upper_max(imx)); */ fclose(fp); } printf("After normalization/log-odds:\n"); printf("i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx)); printf("k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx)); p7_profile_Destroy(gm); p7_bg_Destroy(bg); p7_hmm_Destroy(core); p7_hmm_Destroy(hmm); p7_trace_Destroy(tr); esl_sq_Destroy(sq); esl_dmatrix_Destroy(imx); esl_dmatrix_Destroy(kmx); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return eslOK; }