/* multifetch: * given a file containing lines with one name or key per line; * parse the file line-by-line; * if we have an SSI index available, retrieve the HMMs by key * as we see each line; * else, without an SSI index, store the keys in a hash, then * read the entire HMM file in a single pass, outputting HMMs * that are in our keylist. * * Note that with an SSI index, you get the HMMs in the order they * appear in the <keyfile>, but without an SSI index, you get HMMs in * the order they occur in the HMM file. */ static void multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, P7_HMMFILE *hfp) { ESL_KEYHASH *keys = esl_keyhash_Create(); ESL_FILEPARSER *efp = NULL; ESL_ALPHABET *abc = NULL; P7_HMM *hmm = NULL; int nhmm = 0; char *key; int keylen; int keyidx; int status; if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) p7_Fail("Failed to open key file %s\n", keyfile); esl_fileparser_SetCommentChar(efp, '#'); while (esl_fileparser_NextLine(efp) == eslOK) { if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK) p7_Fail("Failed to read HMM name on line %d of file %s\n", efp->linenumber, keyfile); status = esl_key_Store(keys, key, &keyidx); if (status == eslEDUP) p7_Fail("HMM key %s occurs more than once in file %s\n", key, keyfile); if (hfp->ssi != NULL) { onefetch(go, ofp, key, hfp); nhmm++; } } if (hfp->ssi == NULL) { while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) p7_Fail("read failed, HMM file %s may be truncated?", hfp->fname); else if (status == eslEFORMAT) p7_Fail("bad file format in HMM file %s", hfp->fname); else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", hfp->fname); if (esl_key_Lookup(keys, hmm->name, &keyidx) == eslOK || ((hmm->acc) && esl_key_Lookup(keys, hmm->acc, &keyidx) == eslOK)) { p7_hmmfile_WriteASCII(ofp, -1, hmm); nhmm++; } p7_hmm_Destroy(hmm); } } if (ofp != stdout) printf("\nRetrieved %d HMMs.\n", nhmm); if (abc != NULL) esl_alphabet_Destroy(abc); esl_keyhash_Destroy(keys); esl_fileparser_Close(efp); return; }
static int output_result(const struct cfg_s *cfg, char *errbuf, int msaidx, ESL_MSA *msa, P7_HMM *hmm, ESL_MSA *postmsa, double entropy) { int status; /* Special case: output the tabular results header. * Arranged this way to keep the two fprintf()'s close together in the code, * so we can keep the data and labels properly sync'ed. */ if (msa == NULL) { fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", " idx", "name", "nseq", "alen", "mlen", "eff_nseq", "re/pos", "description"); fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", "----", "--------------------", "-----", "-----", "-----", "--------", "------", "-----------"); return eslOK; } if ((status = p7_hmm_Validate(hmm, errbuf, 0.0001)) != eslOK) return status; if ((status = p7_hmmfile_WriteASCII(cfg->hmmfp, -1, hmm)) != eslOK) ESL_FAIL(status, errbuf, "HMM save failed"); /* # name nseq alen M eff_nseq re/pos description*/ fprintf(cfg->ofp, "%-5d %-20s %5d %5" PRId64 " %5d %8.2f %6.3f %s\n", msaidx, (msa->name != NULL) ? msa->name : "", msa->nseq, msa->alen, hmm->M, hmm->eff_nseq, entropy, (msa->desc != NULL) ? msa->desc : ""); if (cfg->postmsafp != NULL && postmsa != NULL) { esl_msa_Write(cfg->postmsafp, postmsa, eslMSAFILE_STOCKHOLM); } return eslOK; }
/* onefetch(): * Given one <key> (an HMM name or accession), retrieve the corresponding HMM. * In SSI mode, we can do this quickly by positioning the file, then reading * and writing the HMM that's at that position. * Without an SSI index, we have to parse the HMMs sequentially 'til we find * the one we're after. */ static void onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, P7_HMMFILE *hfp) { ESL_ALPHABET *abc = NULL; P7_HMM *hmm = NULL; int status; if (hfp->ssi != NULL) { status = p7_hmmfile_PositionByKey(hfp, key); if (status == eslENOTFOUND) p7_Fail("HMM %s not found in SSI index for file %s\n", key, hfp->fname); else if (status == eslEFORMAT) p7_Fail("Failed to parse SSI index for %s\n", hfp->fname); else if (status != eslOK) p7_Fail("Failed to look up location of HMM %s in SSI index of file %s\n", key, hfp->fname); } while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) p7_Fail("read failed, HMM file %s may be truncated?", hfp->fname); else if (status == eslEFORMAT) p7_Fail("bad file format in HMM file %s", hfp->fname); else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", hfp->fname); if (strcmp(key, hmm->name) == 0 || (hmm->acc && strcmp(key, hmm->acc) == 0)) break; p7_hmm_Destroy(hmm); hmm = NULL; } if (status == eslOK) { p7_hmmfile_WriteASCII(ofp, -1, hmm); p7_hmm_Destroy(hmm); } else p7_Fail("HMM %s not found in file %s\n", key, hfp->fname); esl_alphabet_Destroy(abc); }
static void utest_ReadWrite(P7_HMM *hmm, P7_OPROFILE *om) { char *msg = "oprofile read/write unit test failure"; ESL_ALPHABET *abc = NULL; P7_OPROFILE *om2 = NULL; char tmpfile[16] = "esltmpXXXXXX"; char *mfile = NULL; char *ffile = NULL; char *pfile = NULL; char *ssifile = NULL; FILE *fp = NULL; FILE *mfp = NULL; FILE *ffp = NULL; FILE *pfp = NULL; ESL_NEWSSI *nssi = NULL; P7_HMMFILE *hfp = NULL; uint16_t fh = 0; float tolerance = 0.001; char errbuf[eslERRBUFSIZE]; /* 1. A mini version of hmmpress: save the test HMM to a file along with its associated .h3{mfpi} files */ if ( esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg); if ( esl_sprintf(&mfile, "%s.h3m", tmpfile) != eslOK) esl_fatal(msg); if ( esl_sprintf(&ffile, "%s.h3f", tmpfile) != eslOK) esl_fatal(msg); if ( esl_sprintf(&pfile, "%s.h3p", tmpfile) != eslOK) esl_fatal(msg); if ( esl_sprintf(&ssifile, "%s.h3i", tmpfile) != eslOK) esl_fatal(msg); if ( esl_newssi_Open(ssifile, TRUE, &nssi) != eslOK) esl_fatal(msg); if (( mfp = fopen(mfile, "wb")) == NULL) esl_fatal(msg); if (( ffp = fopen(ffile, "wb")) == NULL) esl_fatal(msg); if (( pfp = fopen(pfile, "wb")) == NULL) esl_fatal(msg); /* the disk offsets are all 0 by construction, if there's only one * HMM in the file - but don't want to forget them, if we change the * unit test in the future to be multi HMM */ if ((om->offs[p7_MOFFSET] = ftello(mfp)) == -1) esl_fatal(msg); if ((om->offs[p7_FOFFSET] = ftello(ffp)) == -1) esl_fatal(msg); if ((om->offs[p7_POFFSET] = ftello(pfp)) == -1) esl_fatal(msg); if ( p7_hmmfile_WriteASCII(fp, -1, hmm) != eslOK) esl_fatal(msg); if ( p7_hmmfile_WriteBinary(mfp, -1, hmm) != eslOK) esl_fatal(msg); if ( p7_oprofile_Write(ffp, pfp, om) != eslOK) esl_fatal(msg); if ( esl_newssi_AddFile(nssi, tmpfile, 0, &fh) != eslOK) esl_fatal(msg); if ( esl_newssi_AddKey (nssi, hmm->name, fh, om->offs[p7_MOFFSET], 0, 0) != eslOK) esl_fatal(msg); if ( esl_newssi_Write(nssi) != eslOK) esl_fatal(msg); fclose(fp); fclose(mfp); fclose(ffp); fclose(pfp); esl_newssi_Close(nssi); /* 2. read the optimized profile back in */ if ( p7_hmmfile_Open(tmpfile, NULL, &hfp) != eslOK) esl_fatal(msg); if ( p7_oprofile_ReadMSV(hfp, &abc, &om2) != eslOK) esl_fatal(msg); if ( p7_oprofile_ReadRest(hfp, om2) != eslOK) esl_fatal(msg); /* 3. it should be identical to the original */ if ( p7_oprofile_Compare(om, om2, tolerance, errbuf) != eslOK) esl_fatal("%s\n%s", msg, errbuf); p7_oprofile_Destroy(om2); p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); remove(ssifile); remove(ffile); remove(pfile); remove(mfile); remove(tmpfile); free(ssifile); free(mfile); free(ffile); free(pfile); }
/** * int main(int argc, char **argv) * Main driver */ int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; char *outhmmfile = NULL; P7_HMMFILE *hfp = NULL; FILE *outhmmfp; /* HMM output file handle */ P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float average_internal_transitions[ p7H_NTRANSITIONS ]; int k; char errmsg[eslERRBUFSIZE]; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { profillic_p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 2) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <input hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) { puts("Failed to read <output hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } profillic_p7_banner(stdout, argv[0], banner); /* Initializations: open the input HMM file for reading */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Initializations: open the output HMM file for writing */ if ((outhmmfp = fopen(outhmmfile, "w")) == NULL) ESL_FAIL(status, errmsg, "Failed to open HMM file %s for writing", outhmmfile); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if (bg == NULL) bg = p7_bg_Create(abc); esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.); for( k = 1; k < hmm->M; k++ ) { esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS); } // Match transitions esl_vec_FNorm(average_internal_transitions, 3); // Insert transitions esl_vec_FNorm(average_internal_transitions + 3, 2); // Delete transitions esl_vec_FNorm(average_internal_transitions + 5, 2); // Ok now set them. for( k = 1; k < hmm->M; k++ ) { esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] ); } if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK) return status; if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK) ESL_FAIL(status, errmsg, "HMM save failed"); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); if (outhmmfp != NULL) fclose(outhmmfp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }