int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); ESL_ALPHABET *abc = NULL; int infmt = eslMSAFILE_UNKNOWN; ESLX_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; FILE *ofp = stdout; int nali = 0; int namewidth; double pid; int nid, n; int i,j; int status; /* allow user to assert the input MSA alphabet */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); /* allow user to assert the input MSA format */ if (esl_opt_IsOn(go, "--informat") && (infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat")); /* digital open */ if ( ( status = eslx_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ((status = eslx_msafile_Read(afp, &msa)) == eslOK) { nali++; namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq); for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { esl_dst_XPairId(abc, msa->ax[i], msa->ax[j], &pid, &nid, &n); fprintf(ofp, "%-*s %-*s %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n); } esl_msa_Destroy(msa); } if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); eslx_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
void worker_process(ESL_GETOPTS *go) { HMMD_COMMAND *cmd = NULL; /* see hmmpgmd.h */ int shutdown = 0; WORKER_ENV env; int status; /* Initializations */ impl_Init(); p7_FLogsumInit(); /* we're going to use table-driven Logsum() approximations at times */ if (esl_opt_IsOn(go, "--cpu")) env.ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&env.ncpus); env.hmm_db = NULL; env.seq_db = NULL; env.fd = setup_masterside_comm(go); while (!shutdown) { if ((status = read_Command(&cmd, &env)) != eslOK) break; switch (cmd->hdr.command) { case HMMD_CMD_INIT: process_InitCmd (cmd, &env); break; case HMMD_CMD_SCAN: process_SearchCmd(cmd, &env); break; case HMMD_CMD_SEARCH: process_SearchCmd(cmd, &env); break; case HMMD_CMD_SHUTDOWN: process_Shutdown (cmd, &env); shutdown = 1; break; default: p7_syslog(LOG_ERR,"[%s:%d] - unknown command %d (%d)\n", __FILE__, __LINE__, cmd->hdr.command, cmd->hdr.length); } free(cmd); cmd = NULL; } if (env.hmm_db) p7_hmmcache_Close(env.hmm_db); if (env.seq_db) p7_seqcache_Close(env.seq_db); if (env.fd != -1) close(env.fd); return; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *hmmfile = esl_opt_GetArg(go, 1); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; double lambda = 0.0; double mmu = 0.0; double vmu = 0.0; double ftau = 0.0; int Z = esl_opt_GetInteger(go, "-Z"); int EmL = esl_opt_GetInteger(go, "--EmL"); int EmN = esl_opt_GetInteger(go, "--EmN"); int EvL = esl_opt_GetInteger(go, "--EvL"); int EvN = esl_opt_GetInteger(go, "--EvN"); int EfL = esl_opt_GetInteger(go, "--EfL"); int EfN = esl_opt_GetInteger(go, "--EfN"); int Eft = esl_opt_GetReal (go, "--Eft"); int iteration; int do_msv, do_vit, do_fwd; int status; if (esl_opt_GetBoolean(go, "--msvonly") == TRUE) { do_msv = TRUE; do_vit = FALSE; do_fwd = FALSE; } else if (esl_opt_GetBoolean(go, "--vitonly") == TRUE) { do_msv = FALSE; do_vit = TRUE; do_fwd = FALSE; } else if (esl_opt_GetBoolean(go, "--fwdonly") == TRUE) { do_msv = FALSE; do_vit = FALSE; do_fwd = TRUE; } else { do_msv = TRUE; do_vit = TRUE; do_fwd = TRUE; } if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (bg == NULL) bg = p7_bg_Create(abc); gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, EvL, p7_LOCAL); /* the EvL doesn't matter */ om = p7_oprofile_Create(hmm->M, abc); p7_oprofile_Convert(gm, om); if (esl_opt_IsOn(go, "--lambda")) lambda = esl_opt_GetReal(go, "--lambda"); else p7_Lambda(hmm, bg, &lambda); for (iteration = 0; iteration < Z; iteration++) { if (do_msv) p7_MSVMu (r, om, bg, EmL, EmN, lambda, &mmu); if (do_vit) p7_ViterbiMu (r, om, bg, EvL, EvN, lambda, &vmu); if (do_fwd) p7_Tau (r, om, bg, EfL, EfN, lambda, Eft, &ftau); printf("%s %.4f %.4f %.4f %.4f\n", hmm->name, lambda, mmu, vmu, ftau); } p7_hmm_Destroy(hmm); p7_profile_Destroy(gm); p7_oprofile_Destroy(om); } p7_hmmfile_Close(hfp); p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return eslOK; }
static void search_thread(void *arg) { int i; int count; int seed; int status; int workeridx; WORKER_INFO *info; ESL_THREADS *obj; ESL_SQ dbsq; ESL_STOPWATCH *w = NULL; /* timing stopwatch */ P7_BUILDER *bld = NULL; /* HMM construction configuration */ P7_BG *bg = NULL; /* null model */ P7_PIPELINE *pli = NULL; /* work pipeline */ P7_TOPHITS *th = NULL; /* top hit results */ P7_PROFILE *gm = NULL; /* generic model */ P7_OPROFILE *om = NULL; /* optimized query profile */ obj = (ESL_THREADS *) arg; esl_threads_Started(obj, &workeridx); info = (WORKER_INFO *) esl_threads_GetData(obj, workeridx); w = esl_stopwatch_Create(); bg = p7_bg_Create(info->abc); esl_stopwatch_Start(w); /* set up the dummy description and accession fields */ dbsq.desc = ""; dbsq.acc = ""; /* process a query sequence or hmm */ if (info->seq != NULL) { bld = p7_builder_Create(NULL, info->abc); if ((seed = esl_opt_GetInteger(info->opts, "--seed")) > 0) { esl_randomness_Init(bld->r, seed); bld->do_reseeding = TRUE; } bld->EmL = esl_opt_GetInteger(info->opts, "--EmL"); bld->EmN = esl_opt_GetInteger(info->opts, "--EmN"); bld->EvL = esl_opt_GetInteger(info->opts, "--EvL"); bld->EvN = esl_opt_GetInteger(info->opts, "--EvN"); bld->EfL = esl_opt_GetInteger(info->opts, "--EfL"); bld->EfN = esl_opt_GetInteger(info->opts, "--EfN"); bld->Eft = esl_opt_GetReal (info->opts, "--Eft"); if (esl_opt_IsOn(info->opts, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(info->opts, "--mxfile"), NULL, esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(info->opts, "--mx"), esl_opt_GetReal(info->opts, "--popen"), esl_opt_GetReal(info->opts, "--pextend"), bg); if (status != eslOK) { //client_error(info->sock, status, "hmmgpmd: failed to set single query sequence score system: %s", bld->errbuf); fprintf(stderr, "hmmpgmd: failed to set single query sequence score system: %s", bld->errbuf); pthread_exit(NULL); return; } p7_SingleBuilder(bld, info->seq, bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */ p7_builder_Destroy(bld); } else { gm = p7_profile_Create (info->hmm->M, info->abc); om = p7_oprofile_Create(info->hmm->M, info->abc); p7_ProfileConfig(info->hmm, bg, gm, 100, p7_LOCAL); p7_oprofile_Convert(gm, om); } /* Create processing pipeline and hit list */ th = p7_tophits_Create(); pli = p7_pipeline_Create(info->opts, om->M, 100, FALSE, p7_SEARCH_SEQS); p7_pli_NewModel(pli, om, bg); if (pli->Z_setby == p7_ZSETBY_NTARGETS) pli->Z = info->db_Z; /* loop until all sequences have been processed */ count = 1; while (count > 0) { int inx; int blksz; HMMER_SEQ **sq; /* grab the next block of sequences */ if (pthread_mutex_lock(info->inx_mutex) != 0) p7_Fail("mutex lock failed"); inx = *info->inx; blksz = *info->blk_size; if (inx > *info->limit) { blksz /= 5; if (blksz < 1000) { *info->limit = info->sq_cnt * 2; } else { *info->limit = inx + (info->sq_cnt - inx) * 2 / 3; } } *info->blk_size = blksz; *info->inx += blksz; if (pthread_mutex_unlock(info->inx_mutex) != 0) p7_Fail("mutex unlock failed"); sq = info->sq_list + inx; count = info->sq_cnt - inx; if (count > blksz) count = blksz; /* Main loop: */ for (i = 0; i < count; ++i, ++sq) { if ( !(info->range_list) || hmmpgmd_IsWithinRanges ((*sq)->idx, info->range_list)) { dbsq.name = (*sq)->name; dbsq.dsq = (*sq)->dsq; dbsq.n = (*sq)->n; dbsq.idx = (*sq)->idx; if((*sq)->desc != NULL) dbsq.desc = (*sq)->desc; p7_bg_SetLength(bg, dbsq.n); p7_oprofile_ReconfigLength(om, dbsq.n); p7_Pipeline(pli, om, bg, &dbsq, th); p7_pipeline_Reuse(pli); } } } /* make available the pipeline objects to the main thread */ info->th = th; info->pli = pli; /* clean up */ p7_bg_Destroy(bg); p7_oprofile_Destroy(om); if (gm != NULL) p7_profile_Destroy(gm); esl_stopwatch_Stop(w); info->elapsed = w->elapsed; esl_stopwatch_Destroy(w); esl_threads_Finished(obj, workeridx); pthread_exit(NULL); return; }
/* serial_master() * The serial version of hmmbuild. * For each MSA, build an HMM and save it. * * A master can only return if it's successful. All errors are handled immediately and fatally with p7_Fail(). */ static int serial_master(const ESL_GETOPTS *go, struct cfg_s *cfg) { int status; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS WORK_ITEM *item = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errmsg[eslERRBUFSIZE]; if ((status = init_master_cfg(go, cfg, errmsg)) != eslOK) p7_Fail(errmsg); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(cfg->abc); info[i].bld = p7_builder_Create(go, cfg->abc); if (info[i].bld == NULL) p7_Fail("p7_builder_Create failed"); #ifdef HMMER_THREADS info[i].queue = queue; if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { ESL_ALLOC(item, sizeof(*item)); item->nali = 0; item->processed = FALSE; item->postmsa = NULL; item->msa = NULL; item->hmm = NULL; item->entropy = 0.0; status = esl_workqueue_Init(queue, item); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif #ifdef HMMER_THREADS if (ncpus > 0) status = thread_loop(threadObj, queue, cfg); else status = serial_loop(info, cfg); #else status = serial_loop(info, cfg); #endif if (status == eslEFORMAT) esl_fatal("Alignment file parse error:\n%s\n", cfg->afp->errbuf); else if (status == eslEINVAL) esl_fatal("Alignment file parse error:\n%s\n", cfg->afp->errbuf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); for (i = 0; i < infocnt; ++i) { p7_bg_Destroy(info[i].bg); p7_builder_Destroy(info[i].bld); } #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &item) == eslOK) { free(item); } esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); return eslOK; ERROR: return eslFAIL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *seqfile = NULL; /* sequence file name */ char *maskfile = NULL; /* mask coordinate file name */ int infmt = eslSQFILE_UNKNOWN; /* format code for seqfile */ int outfmt = eslSQFILE_FASTA; /* format code for output seqs */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ ESL_FILEPARSER *maskefp = NULL; /* open mask coord file */ FILE *ofp = NULL; /* output stream for masked seqs */ char *source = NULL; /* name of current seq to mask */ char *p1, *p2; /* pointers used in parsing */ int64_t start, end; /* start, end coord for masking */ int64_t i, j, pos; /* coords in a sequence */ int64_t overmask; /* # of extra residues to mask */ ESL_SQ *sq = esl_sq_Create(); /* current sequence */ int do_fetching; int do_lowercase; int maskchar; int status; /* easel return code */ /**************************************************************************** * Parse command line ****************************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); do_fetching = esl_opt_GetBoolean(go, "-R"); do_lowercase = esl_opt_GetBoolean(go, "-l"); overmask = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0); maskchar = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m") : 'X'); seqfile = esl_opt_GetArg(go, 1); maskfile = esl_opt_GetArg(go, 2); /* Open the <seqfile>: text mode, not digital */ if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n", seqfile); else if (status == eslEFORMAT) cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile); else if (status == eslEINVAL) cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n"); else if (status != eslOK) cmdline_failure(argv[0], "Open failed, code %d.\n", status); if(do_fetching) { status = esl_sqfile_OpenSSI(sqfp, NULL); if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n"); else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n"); else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n"); } /* Open the <maskfile> */ if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK) cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile); esl_fileparser_SetCommentChar(maskefp, '#'); /* Open the output file, if any */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /**************************************************************************** * Main loop over lines in <maskfile> ****************************************************************************/ /* Read one data line at a time from the <maskfile>; * parse into data fields <seqname> <start> <end> */ while (esl_fileparser_NextLine(maskefp) == eslOK) { /* First field is sequence name */ if (esl_fileparser_GetTokenOnLine(maskefp, &source, NULL) != eslOK) esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile); /* Get the sequence */ if (do_fetching) { /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */ status = esl_sqio_Fetch(sqfp, source, sq); if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename); else if (status == eslEINVAL) esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename); } else { /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */ status = esl_sqio_Read(sqfp, sq); if (status == eslEOF) esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source); else if (status == eslEFORMAT) esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslOK) esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename); if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0)) esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R"); } /* If we're masking by lowercase, first make sure everything's uppercase */ if (do_lowercase) for (pos = 0; pos < sq->n; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = toupper(sq->seq[pos]); /* Next two fields are <start>, <end> for the masking */ /* possible future extension: wrap loop around this, enable multiple masked regions */ if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK) esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile); start = strtoll(p1, &p2, 0) - 1; if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK) esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile); end = strtoll(p2, &p1, 0) - 1; /* Do the masking */ if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */ { /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */ i = 0; j = ESL_MIN(sq->n-1, start - 1 + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); i = ESL_MAX(0, end + 1 - overmask); j = sq->n-1; for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } else { /* normal: mask start..end */ i = ESL_MAX(0, start - overmask); j = ESL_MIN(sq->n-1, end + overmask); for (pos = i; pos <= j; pos++) if (isalpha(sq->seq[pos])) sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar); } esl_sqio_Write(ofp, sq, outfmt, FALSE); esl_sq_Reuse(sq); } esl_sq_Destroy(sq); esl_fileparser_Close(maskefp); esl_sqfile_Close(sqfp); esl_getopts_Destroy(go); if (ofp != stdout) fclose(ofp); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float nseq; int do_eval2score = 0; int do_score2eval = 0; int z_val; float e_val; float s_val; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } output_header(stdout, go); if ( esl_opt_IsOn(go, "--eval2score") ) { do_eval2score = TRUE; e_val = esl_opt_GetReal(go, "-E"); } else if ( esl_opt_IsOn(go, "--score2eval") ) { do_score2eval = TRUE; s_val = esl_opt_GetReal(go, "-S"); } else if ( esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1") || esl_opt_IsUsed(go, "-Z") ) { puts("The flags -Z, --baseZ, and --baseZ1 are for use with --eval2score and --score2eval."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_IsUsed(go, "--baseZ") ) { z_val = 1000000 * 2 * (long)(esl_opt_GetInteger(go, "--baseZ")); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { z_val = 1000000 * (long)(esl_opt_GetInteger(go, "--baseZ1")); } else { z_val = esl_opt_GetInteger(go, "-Z"); } /* Initializations: open the HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); if (do_eval2score) printf (" %6s %6.2g", "sc for", e_val); if (do_score2eval) printf (" %6s %6.2f", "E-val for", s_val); printf("\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); if (do_eval2score) printf (" %13s", "-------------"); if (do_score2eval) printf (" %13s", "-------------"); printf("\n"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if ( esl_opt_IsOn(go, "--eval2score") || esl_opt_IsOn(go, "--score2eval") ) { if (esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1" ) ) { if ( hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --baseZ and --baseZ1 can't be used with non-nucleotide models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } else if ( hmm->abc->type != eslAMINO && hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --eval2score and --score2eval can't be used with non-sequence models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } if (esl_opt_IsUsed(go, "--baseZ") ) { nseq = (float)z_val / (float)(hmm->max_length); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { nseq = (float)z_val / (float)(hmm->max_length); } else { nseq = z_val; } if (bg == NULL) bg = p7_bg_Create(abc); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); if ( do_eval2score ) { float sc; sc = esl_exp_invsurv( e_val / nseq , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2f", sc); } else if ( do_score2eval) { float e; e = nseq * esl_exp_surv( s_val , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2g", e); } printf("\n"); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmdb> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* output file for results (default stdout) */ FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */ FILE *dfamtblfp = NULL; /* output stream for tabular Dfam format (--dfamtblout) */ FILE *aliscoresfp = NULL; /* output stream for alignment scores (--aliscoresout) */ // P7_HMM *hmm = NULL; /* one HMM query */ // P7_SCOREDATA *scoredata = NULL; int seqfmt = eslSQFILE_UNKNOWN; /* format of seqfile */ ESL_SQFILE *sqfp = NULL; /* open seqfile */ P7_HMMFILE *hfp = NULL; /* open HMM database file */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ P7_OPROFILE *om = NULL; /* target profile */ ESL_STOPWATCH *w = NULL; /* timing */ ESL_SQ *qsq = NULL; /* query sequence */ int nquery = 0; int textw; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS P7_OM_BLOCK *block = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errbuf[eslERRBUFSIZE]; double window_beta = -1.0 ; int window_length = -1; if (esl_opt_IsUsed(go, "--w_beta")) { if ( ( window_beta = esl_opt_GetReal(go, "--w_beta") ) < 0 || window_beta > 1 ) esl_fatal("Invalid window-length beta value\n"); } if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4 ) esl_fatal("Invalid window length value\n"); } w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--qformat")) { seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat")); if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat")); } /* validate options if running as a daemon */ // if (esl_opt_IsOn(go, "--daemon")) { /* running as a daemon, the input format must be type daemon */ // if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON) // esl_fatal("Input format %s not supported. Must be daemon\n", esl_opt_GetString(go, "--qformat")); // seqfmt = eslSQFILE_DAEMON; // if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n"); // } /* Open the target profile database to get the sequence alphabet */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem, trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); if (! hfp->is_pressed) p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n", hfp->fname); hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om); if (hstatus == eslEFORMAT) p7_Fail("bad format, binary auxfiles, %s:\n%s", cfg->hmmfile, hfp->errbuf); else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); else if (hstatus != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); p7_oprofile_Destroy(om); p7_hmmfile_Close(hfp); /* Open the query sequence database */ status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->seqfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile); if (sqfp->format > 100) // breaking the law! That range is reserved for msa, for aligned formats p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n", cfg->seqfile); qsq = esl_sq_CreateDigital(abc); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); } if (esl_opt_IsOn(go, "--dfamtblout")) { if ((dfamtblfp = fopen(esl_opt_GetString(go, "--dfamtblout"),"w")) == NULL) esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); } if (esl_opt_IsOn(go, "--aliscoresout")) { if ((aliscoresfp = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL) esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); } output_header(ofp, go, cfg->hmmfile, cfg->seqfile); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(abc); #ifdef HMMER_THREADS info[i].queue = queue; #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { block = p7_oprofile_CreateBlock(BLOCK_SIZE); if (block == NULL) esl_fatal("Failed to allocate sequence block"); status = esl_workqueue_Init(queue, block); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif /* Outside loop: over each query sequence in <seqfile>. */ while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK) { if (sstatus == eslEMEM) p7_Fail("Memory allocation error reading sequence file\n", status); if (sstatus == eslEINCONCEIVABLE) p7_Fail("Unexpected error %d reading sequence file\n", status); // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT) p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n", qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT); nquery++; esl_stopwatch_Start(w); /* Open the target profile database */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL); if (status != eslOK) p7_Fail("Unexpected error %d in opening hmm file %s.\n", status, cfg->hmmfile); #ifdef HMMER_THREADS /* if we are threaded, create a lock to prevent multiple readers */ if (ncpus > 0) { status = p7_hmmfile_CreateLock(hfp); if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status); } #endif if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->acc[0] != 0 && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */ info[i].pli->hfp = hfp; /* for two-stage input, pipeline needs <hfp> */ p7_pli_NewSeq(info[i].pli, qsq); info[i].qsq = qsq; if ( esl_opt_IsUsed(go, "--toponly") ) info[i].pli->strand = p7_STRAND_TOPONLY; else if ( esl_opt_IsUsed(go, "--bottomonly") ) info[i].pli->strand = p7_STRAND_BOTTOMONLY; else info[i].pli->strand = p7_STRAND_BOTH; #ifdef HMMER_THREADS if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS if (ncpus > 0) hstatus = thread_loop(threadObj, queue, hfp); else hstatus = serial_loop(info, hfp); #else hstatus = serial_loop(info, hfp); #endif switch(hstatus) { case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing */ break; default: p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } /* modify e-value to account for number of models */ for (i = 0; i < info->th->N ; i++) { info->th->unsrt[i].lnP += log((float)info->pli->nmodels); info->th->unsrt[i].dcl[0].lnP = info->th->unsrt[i].lnP; info->th->unsrt[i].sortkey = -1.0 * info->th->unsrt[i].lnP; } /* it's possible to have duplicates based on how viterbi ranges can overlap */ p7_tophits_SortByModelnameAndAlipos(info->th); p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs); /* Print results */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); //tally up total number of hits and target coverage info->pli->n_output = info->pli->pos_output = 0; for (i = 0; i < info->th->N; i++) { if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) { info->pli->n_output++; info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1; } } p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1)); if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp, qsq->name, NULL, info->th, info->pli); if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th ); esl_stopwatch_Stop(w); info->pli->nseqs = 1; p7_pli_Statistics(ofp, info->pli, w); if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); fflush(ofp); p7_hmmfile_Close(hfp); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); esl_sq_Reuse(qsq); } if (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (sstatus != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", sstatus, sqfp->filename); /* Terminate outputs - any last words? */ if (tblfp) p7_tophits_TabularTail(tblfp, "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go); if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Cleanup - prepare for successful exit */ for (i = 0; i < infocnt; ++i) p7_bg_Destroy(info[i].bg); #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &block) == eslOK) p7_oprofile_DestroyBlock(block); esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); esl_sq_Destroy(qsq); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); esl_sqfile_Close(sqfp); if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return eslOK; ERROR: if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ ESL_RANDOMNESS *r = NULL; /* source of randomness */ char *hmmfile = NULL; /* file to read HMM(s) from */ P7_HMMFILE *hfp = NULL; /* open hmmfile */ P7_HMM *hmm = NULL; /* HMM to emit from */ FILE *ofp = NULL; /* output stream */ int outfmt = 0; int nhmms = 0; int status; char errbuf[eslERRBUFSIZE]; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) cmdline_failure(argv[0], "Failed to get <hmmfile> on cmdline: %s\n", go->errbuf); if ( esl_opt_IsOn(go, "-o") ) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s", esl_opt_GetString(go, "-o")); } else ofp = stdout; if (esl_opt_GetBoolean(go, "-a")) outfmt = eslMSAFILE_STOCKHOLM; else outfmt = eslSQFILE_FASTA; r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "--seed")); status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEFORMAT) esl_fatal("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) esl_fatal("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s\n", hfp->fname); nhmms++; if (esl_opt_GetBoolean(go, "-c")) emit_consensus(go, ofp, outfmt, hmm); else if (esl_opt_GetBoolean(go, "-C")) emit_fancycons(go, ofp, outfmt, hmm); else if (esl_opt_GetBoolean(go, "-a")) emit_alignment(go, ofp, outfmt, r, hmm); else emit_sequences(go, ofp, outfmt, r, hmm); p7_hmm_Destroy(hmm); } if (nhmms == 0) esl_fatal("Empty HMM file %s? No HMM data found.\n"); if (esl_opt_IsOn(go, "-o")) { fclose(ofp); } esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); p7_hmmfile_Close(hfp); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile1= NULL; /* alignment 1 file name */ char *alifile2= NULL; /* alignment 2 file name */ int fmt; /* format code for alifiles */ ESLX_MSAFILE *afp1 = NULL; /* open alignment file 1 */ ESLX_MSAFILE *afp2 = NULL; /* open alignment file 2 */ ESL_MSA *msa1 = NULL; /* multiple sequence alignment 1 */ ESL_MSA *msa2 = NULL; /* multiple sequence alignment 2 */ int status; /* easel return code */ char errbuf[eslERRBUFSIZE*4]; int *msa1_to_msa2_map; /* map from <msafile1> to <msafile2> */ char *sub_msa1_to_msa2_mask; /* with --sub the map from <msafile1> to <msafile2> in mask form */ FILE *subfp = NULL; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile1 = esl_opt_GetArg(go, 1); alifile2 = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the MSA files ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status); if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status); /****************************************************************** * Read first alignment from each file, we only use the first one ******************************************************************/ if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status); if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status); /* map the alignments in msa1 and msa2 */ if(! esl_opt_IsOn(go, "--submap")) { if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR; free(msa1_to_msa2_map); } /* --submap: if nec, map <msafile1> to a subset of it's own columns in <msafile2> */ else { /* --submap was enabled */ if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) ESL_FAIL(eslFAIL, errbuf, "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap")); if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR; fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask); fclose(subfp); subfp = NULL; printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); free(sub_msa1_to_msa2_mask); } /* Cleanup, normal return */ eslx_msafile_Close(afp1); eslx_msafile_Close(afp2); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msa_Destroy(msa1); esl_msa_Destroy(msa2); return 0; ERROR: if (afp1) eslx_msafile_Close(afp1); if (afp2) eslx_msafile_Close(afp2); if (go) esl_getopts_Destroy(go); if (msa1) esl_msa_Destroy(msa1); if (msa2) esl_msa_Destroy(msa2); if (subfp) fclose(subfp); esl_fatal(errbuf); return 1; /* never reached */ }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmfile> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). We also use the * ESL_EXCEPTION and ERROR: mechanisms, but only because we know we're * using a fatal exception handler. */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* results output file (-o) */ P7_HMMFILE *hfp = NULL; /* open input HMM file */ ESL_SQFILE *dbfp = NULL; /* open input sequence file */ P7_HMM *hmm = NULL; /* one HMM query */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ int dbfmt = eslSQFILE_UNKNOWN; /* format code for sequence database file */ ESL_STOPWATCH *w; int textw = 0; int nquery = 0; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; char errbuf[eslERRBUFSIZE]; w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); if (esl_opt_IsOn(go, "--tformat")) { dbfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat")); if (dbfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat")); } /* Open the target sequence database */ status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->dbfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->dbfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile); if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) { if (esl_opt_IsUsed(go, "--ssifile")) esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile")); else esl_sqfile_OpenSSI(dbfp, NULL); } /* Open the query profile HMM file */ status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w+")) == NULL) p7_Fail("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } infocnt = 1; ESL_ALLOC(info, sizeof(*info) * infocnt); /* <abc> is not known 'til first HMM is read. */ hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); if (hstatus == eslOK) { /* One-time initializations after alphabet <abc> becomes known */ // output_header(ofp, go, cfg->hmmfile, cfg->dbfile); // dbfp->abc = abc; //ReadBlock requires knowledge of the alphabet to decide how best to read blocks // for (i = 0; i < infocnt; ++i) // { // info[i].bg = p7_bg_Create(abc); // } } /* Outer loop: over each query HMM in <hmmfile>. */ // while (hstatus == eslOK) // { P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; /* optimized query profile */ nquery++; esl_stopwatch_Start(w); /* seqfile may need to be rewound (multiquery mode) */ if (nquery > 1) { if (! esl_sqfile_IsRewindable(dbfp)) esl_fatal("Target sequence file %s isn't rewindable; can't search it with multiple queries", cfg->dbfile); if (! esl_opt_IsUsed(go, "--restrictdb_stkey") ) esl_sqfile_Position(dbfp, 0); //only re-set current position to 0 if we're not planning to set it in a moment } if ( cfg->firstseq_key != NULL ) { //it's tempting to want to do this once and capture the offset position for future passes, but ncbi files make this non-trivial, so this keeps it general sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key); if (sstatus != eslOK) p7_Fail("Failure setting restrictdb_stkey to %d\n", cfg->firstseq_key); } // if (fprintf(ofp, "Query: %s [M=%d]\n", hmm->name, hmm->M) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // if (hmm->acc) { if (fprintf(ofp, "Accession: %s\n", hmm->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Convert to an optimized model */ gm = p7_profile_Create (hmm->M, abc); om = p7_oprofile_Create(hmm->M, abc); // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */ p7_oprofile_Convert(gm, om); /* <om> is now p7_LOCAL, multihit */ for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].om = p7_oprofile_Clone(om); info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */ P7_PIPELINE *pli = info[i].pli; pli->nmodels++; pli->nnodes += info[i].om->M; // if (pli->Z_setby == p7_ZSETBY_NTARGETS && pli->mode == p7_SCAN_MODELS) pli->Z = pli->nmodels; // if (pli->do_biasfilter) p7_bg_SetFilter(info[i].bg, info[i].om->M, info[i].om->compo); // if (pli->mode == p7_SEARCH_SEQS) // status = p7_pli_NewModelThresholds(pli, info[i].om); pli->W = info[i].om->max_length; } sstatus = serial_loop(info, dbfp, cfg->n_targetseq, ofp); switch(sstatus) { case eslEFORMAT: esl_fatal("Parse failed (sequence file %s):\n%s\n", dbfp->filename, esl_sqfile_GetErrorBuf(dbfp)); break; case eslEOF: /* do nothing */ break; default: esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); p7_oprofile_Destroy(info[i].om); } /* Print the results. */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); esl_stopwatch_Stop(w); // p7_pli_Statistics(ofp, info->pli, w); // if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); p7_oprofile_Destroy(info->om); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); // hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); // } /* end outer loop over query HMMs */ switch(hstatus) { case eslEOD: p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile); break; case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing. EOF is what we want. */ break; default: p7_Fail("Unexpected error (%d) in reading HMMs from %s", hstatus, cfg->hmmfile); } /* Terminate outputs... any last words? */ /* Cleanup - prepare for exit */ // for (i = 0; i < infocnt; ++i) // p7_bg_Destroy(info[i].bg); free(info); p7_hmmfile_Close(hfp); esl_sqfile_Close(dbfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); if (ofp != stdout) fclose(ofp); printf("44HHHH \n"); return eslOK; ERROR: return eslFAIL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line configuration */ struct cfg_s cfg; /* application configuration */ char *basename= NULL; /* base of the output file names */ char *alifile = NULL; /* alignment file name */ char *dbfile = NULL; /* name of seq db file */ char outfile[256]; /* name of an output file */ int alifmt; /* format code for alifile */ int dbfmt; /* format code for dbfile */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */ ESL_MSA *msa = NULL; /* MSA after frags are removed */ ESL_MSA *trainmsa= NULL; /* training set, aligned */ ESL_STACK *teststack=NULL; /* test set: stack of ESL_SQ ptrs */ int status; /* easel return code */ int nfrags; /* # of fragments removed */ int ntestdom; /* # of test domains */ int ntest; /* # of test sequences created */ int nali; /* number of alignments read */ double avgid; /* Parse command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h")) cmdline_help(argv[0], go); if (esl_opt_ArgNumber(go) != 3) cmdline_failure(argv[0], "Incorrect number of command line arguments\n"); basename = esl_opt_GetArg(go, 1); alifile = esl_opt_GetArg(go, 2); dbfile = esl_opt_GetArg(go, 3); alifmt = eslMSAFILE_STOCKHOLM; dbfmt = eslSQFILE_FASTA; /* Set up the configuration structure shared amongst functions here */ if (esl_opt_IsDefault(go, "--seed")) cfg.r = esl_randomness_CreateTimeseeded(); else cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); cfg.abc = NULL; /* until we open the MSA file, below */ cfg.fragfrac = esl_opt_GetReal(go, "-F"); cfg.idthresh1 = esl_opt_GetReal(go, "-1"); cfg.idthresh2 = esl_opt_GetReal(go, "-2"); cfg.test_lens = NULL; cfg.ntest = 0; cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0); cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0); /* Open the output files */ if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name"); if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.fa", basename) >= 256) esl_fatal("Failed to construct output FASTA file name"); if ((cfg.out_seqfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open FASTA output file %s\n", outfile); if (snprintf(outfile, 256, "%s.pos", basename) >= 256) esl_fatal("Failed to construct pos test set summary file name"); if ((cfg.possummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open pos test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.neg", basename) >= 256) esl_fatal("Failed to construct neg test set summary file name"); if ((cfg.negsummfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open neg test set summary file %s\n", outfile); if (snprintf(outfile, 256, "%s.tbl", basename) >= 256) esl_fatal("Failed to construct benchmark table file name"); if ((cfg.tblfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open benchmark table file %s\n", outfile); if (esl_opt_GetBoolean(go, "--pid")) { if (snprintf(outfile, 256, "%s.pid", basename) >= 256) esl_fatal("Failed to construct %%id table file name"); if ((cfg.pidfp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open %%id table file %s\n", outfile); } else cfg.pidfp = NULL; /* Open the MSA file, digital mode; determine alphabet */ if (esl_opt_GetBoolean(go, "--amino")) cfg.abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) cfg.abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) cfg.abc = esl_alphabet_Create(eslRNA); status = eslx_msafile_Open(&(cfg.abc), alifile, NULL, alifmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (cfg.abc->type == eslAMINO) esl_composition_SW34(cfg.fq); else esl_vec_DSet(cfg.fq, cfg.abc->K, 1.0 / (double) cfg.abc->K); /* Open and process the dbfile; make sure it's in the same alphabet */ process_dbfile(&cfg, dbfile, dbfmt); /* Read and process MSAs one at a time */ nali = 0; while ((status = eslx_msafile_Read(afp, &origmsa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); esl_msa_ConvertDegen2X(origmsa); esl_msa_Hash(origmsa); remove_fragments(&cfg, origmsa, &msa, &nfrags); separate_sets (&cfg, msa, &trainmsa, &teststack); if ( esl_stack_ObjectCount(teststack) >= 2) { /* randomize test domain order, and apply size limit if any */ esl_stack_Shuffle(cfg.r, teststack); if (cfg.max_ntest) pstack_select_topn(&teststack, cfg.max_ntest); ntestdom = esl_stack_ObjectCount(teststack); /* randomize training set alignment order, and apply size limit if any */ esl_msashuffle_PermuteSequenceOrder(cfg.r, trainmsa); if (cfg.max_ntrain) msa_select_topn(&trainmsa, cfg.max_ntrain); esl_msa_MinimGaps(trainmsa, NULL, NULL, FALSE); if (esl_opt_GetBoolean(go, "--pid")) write_pids(cfg.pidfp, origmsa, trainmsa, teststack); synthesize_positives(go, &cfg, msa->name, teststack, &ntest); eslx_msafile_Write(cfg.out_msafp, trainmsa, eslMSAFILE_STOCKHOLM); esl_dst_XAverageId(cfg.abc, trainmsa->ax, trainmsa->nseq, 10000, &avgid); /* 10000 is max_comparisons, before sampling kicks in */ fprintf(cfg.tblfp, "%-20s %3.0f%% %6d %6d %6d %6d %6d %6d\n", msa->name, 100.*avgid, (int) trainmsa->alen, msa->nseq, nfrags, trainmsa->nseq, ntestdom, ntest); nali++; } esl_msa_Destroy(trainmsa); esl_msa_Destroy(origmsa); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); synthesize_negatives(go, &cfg, esl_opt_GetInteger(go, "-N")); fclose(cfg.out_msafp); fclose(cfg.out_seqfp); fclose(cfg.possummfp); fclose(cfg.negsummfp); fclose(cfg.tblfp); if (cfg.pidfp) fclose(cfg.pidfp); esl_randomness_Destroy(cfg.r); esl_alphabet_Destroy(cfg.abc); eslx_msafile_Close(afp); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *hmmfile = NULL; /* HMM file name */ char *seqfile = NULL; /* sequence file name */ char *mapfile = NULL; /* optional mapped MSA file name */ int infmt = eslSQFILE_UNKNOWN; int outfmt = eslMSAFILE_STOCKHOLM; P7_HMMFILE *hfp = NULL; /* open HMM file */ ESL_SQFILE *sqfp = NULL; /* open sequence file */ char *outfile = NULL; /* output filename */ FILE *ofp = stdout; /* output stream */ ESL_SQ **sq = NULL; /* array of sequences */ void *p = NULL; /* tmp ptr for reallocation */ int nseq = 0; /* # of sequences in <seqfile> */ int mapseq = 0; /* # of sequences in mapped MSA */ int totseq = 0; /* # of seqs in all sources */ ESL_ALPHABET *abc = NULL; /* alphabet (set from the HMM file)*/ P7_HMM *hmm = NULL; P7_TRACE **tr = NULL; /* array of tracebacks */ ESL_MSA *msa = NULL; /* resulting multiple alignment */ int msaopts = 0; /* flags to p7_tracealign_Seqs() */ int idx; /* counter over seqs, traces */ int status; /* easel/hmmer return code */ char errbuf[eslERRBUFSIZE]; /* Parse the command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); seqfile = esl_opt_GetArg(go, 2); if (strcmp(hmmfile, "-") == 0 && strcmp(seqfile, "-") == 0) cmdline_failure(argv[0], "Either <hmmfile> or <seqfile> may be '-' (to read from stdin), but not both.\n"); msaopts |= p7_ALL_CONSENSUS_COLS; /* default as of 3.1 */ if (esl_opt_GetBoolean(go, "--trim")) msaopts |= p7_TRIM; /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--informat")) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Open output stream */ if ( (outfile = esl_opt_GetString(go, "-o")) != NULL) { if ((ofp = fopen(outfile, "w")) == NULL) cmdline_failure(argv[0], "failed to open -o output file %s for writing\n", outfile); } /* If caller forced an alphabet on us, create the one the caller wants */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* Read one HMM, and make sure there's only one. */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); if (status == eslEFORMAT) p7_Fail("Bad file format in HMM file %s:\n%s\n", hfp->fname, hfp->errbuf); else if (status == eslEINCOMPAT) p7_Fail("HMM in %s is not in the expected %s alphabet\n", hfp->fname, esl_abc_DecodeType(abc->type)); else if (status == eslEOF) p7_Fail("Empty HMM file %s? No HMM data found.\n", hfp->fname); else if (status != eslOK) p7_Fail("Unexpected error in reading HMMs from %s\n", hfp->fname); status = p7_hmmfile_Read(hfp, &abc, NULL); if (status != eslEOF) p7_Fail("HMM file %s does not contain just one HMM\n", hfp->fname); p7_hmmfile_Close(hfp); /* We're going to build up two arrays: sequences and traces. * If --mapali option is chosen, the first set of sequences/traces is from the provided alignment */ if ( (mapfile = esl_opt_GetString(go, "--mapali")) != NULL) { map_alignment(mapfile, hmm, &sq, &tr, &mapseq); } totseq = mapseq; /* Read digital sequences into an array (possibly concat'ed onto mapped seqs) */ status = esl_sqfile_OpenDigital(abc, seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", seqfile); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, seqfile); ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq + 1)); sq[totseq] = esl_sq_CreateDigital(abc); nseq = 0; while ((status = esl_sqio_Read(sqfp, sq[totseq+nseq])) == eslOK) { nseq++; ESL_RALLOC(sq, p, sizeof(ESL_SQ *) * (totseq+nseq+1)); sq[totseq+nseq] = esl_sq_CreateDigital(abc); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); esl_sqfile_Close(sqfp); totseq += nseq; /* Remaining initializations, including trace array allocation */ ESL_RALLOC(tr, p, sizeof(P7_TRACE *) * totseq); for (idx = mapseq; idx < totseq; idx++) tr[idx] = p7_trace_CreateWithPP(); p7_tracealign_computeTraces(hmm, sq, mapseq, totseq - mapseq, tr); p7_tracealign_Seqs(sq, tr, totseq, hmm->M, msaopts, hmm, &msa); eslx_msafile_Write(ofp, msa, outfmt); for (idx = 0; idx <= totseq; idx++) esl_sq_Destroy(sq[idx]); /* including sq[nseq] because we overallocated */ for (idx = 0; idx < totseq; idx++) p7_trace_Destroy(tr[idx]); free(sq); free(tr); esl_msa_Destroy(msa); p7_hmm_Destroy(hmm); if (ofp != stdout) fclose(ofp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return eslOK; ERROR: return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *alifile = NULL; /* alignment file name */ int fmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ ESLX_MSAFILE *afp = NULL; /* open msa file */ ESL_MSAFILE2 *old_afp = NULL; /* open msa file, legacy (--small) */ ESL_MSA *msa = NULL; /* one multiple sequence alignment */ int nali; /* number of alignments read */ int i; /* counter over seqs */ int64_t alen; /* alignment length */ int nseq; /* number of sequences in the msa */ int64_t rlen; /* a raw (unaligned) seq length */ int64_t small, large; /* smallest, largest sequence */ int64_t nres; /* total # of residues in msa */ double avgid; /* average fractional pair id */ int max_comparisons; /* maximum # comparisons for avg id */ int do_stall; /* used to stall when debugging */ double **abc_ct = NULL; /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */ double ***bp_ct = NULL; /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair * * count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum * * of 'i:j' for a pair between i and j, where i < j. */ double **pp_ct = NULL; /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */ int *i_am_rf = NULL; /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */ int *rf2a_map = NULL; /* [0..rfpos..rflen-1] = apos, * apos is the alignment position (0..msa->alen-1) that * is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */ int rflen = -1; /* nongap RF length */ char errbuf[eslERRBUFSIZE]; int status; /* easel return code */ /* optional output files */ FILE *iinfofp = NULL; /* output file for --iinfo */ FILE *pcinfofp = NULL; /* output file for --pcinfo */ FILE *psinfofp = NULL; /* output file for --psinfo */ FILE *rinfofp = NULL; /* output file for --rinfo */ FILE *icinfofp = NULL; /* output file for --icinfo */ FILE *listfp = NULL; /* output file for --list */ FILE *cinfofp = NULL; /* output file for --cinfo */ FILE *bpinfofp = NULL; /* output file for --bpinfo */ int use_weights; /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */ int weights_exist; /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\n optional output files:"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } alifile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--informat") && (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n"); max_comparisons = 1000; do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */ while (do_stall); /*********************************************** * Open the MSA file; determine alphabet; set for digital input ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); /* We'd like to get rid of the legacy msafile interface, but it * includes small memory functionality for Pfam format which we have * to replace first. For now, use both interfaces, new and legacy */ if ( esl_opt_GetBoolean(go, "--small") ) { if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified."); status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile); else if (status == eslEFORMAT) esl_fatal("Couldn't determine format of alignment %s\n", alifile); else if (status != eslOK) esl_fatal("Alignment file open failed with error %d\n", status); } else { if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); } /************************************** * Open optional output files, as nec * **************************************/ /* determine name for first list file, if nec */ if( esl_opt_IsOn(go, "--list")) { if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL) esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list")); } if( esl_opt_IsOn(go, "--icinfo")) { if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL) esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo")); } if( esl_opt_IsOn(go, "--rinfo")) { if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL) esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo")); } if( esl_opt_IsOn(go, "--pcinfo")) { if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL) esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo")); } if( esl_opt_IsOn(go, "--psinfo")) { if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL) esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo")); } if( esl_opt_IsOn(go, "--iinfo")) { if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL) esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo")); } if( esl_opt_IsOn(go, "--cinfo")) { if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL) esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo")); } if( esl_opt_IsOn(go, "--bpinfo")) { if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL) esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo")); } /*********************************************** * Read MSAs one at a time. ***********************************************/ if (esl_opt_GetBoolean(go, "-1")) { puts("#"); if(! esl_opt_GetBoolean(go, "--small")) { printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id"); printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---"); } else { printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen"); printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------"); } } nali = 0; fmt = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format); while ( (status = ( esl_opt_GetBoolean(go, "--small") ? esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) : eslx_msafile_Read (afp, &msa))) == eslOK) { nali++; nres = 0; if (! esl_opt_GetBoolean(go, "--small")) { nseq = msa->nseq; alen = msa->alen; small = large = -1; for (i = 0; i < msa->nseq; i++) { rlen = esl_abc_dsqrlen(msa->abc, msa->ax[i]); nres += rlen; if (small == -1 || rlen < small) small = rlen; if (large == -1 || rlen > large) large = rlen; } esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid); } else { /* --small invoked */ for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K); } if (esl_opt_GetBoolean(go, "-1")) { printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, nali, msa->name, eslx_msafile_DecodeFormat(fmt), nseq, alen, nres); if (! esl_opt_GetBoolean(go, "--small")) { printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n", small, large, (double) nres / (double) msa->nseq, 100.*avgid); } else { printf(" %10.1f\n", (double) nres / (double) nseq); } } else { printf("Alignment number: %d\n", nali); if (msa->name != NULL) printf("Alignment name: %s\n", msa->name); printf("Format: %s\n", eslx_msafile_DecodeFormat(fmt)); printf("Number of sequences: %d\n", nseq); printf("Alignment length: %" PRId64 "\n", alen); printf("Total # residues: %" PRId64 "\n", nres); if(! esl_opt_GetBoolean(go, "--small")) { printf("Smallest: %" PRId64 "\n", small); printf("Largest: %" PRId64 "\n", large); } printf("Average length: %.1f\n", (double) nres / (double) nseq); if(! esl_opt_GetBoolean(go, "--small")) { printf("Average identity: %.0f%%\n", 100.*avgid); } printf("//\n"); } /* Dump data to optional output files, if nec */ if(esl_opt_IsOn(go, "--list")) { if(! esl_opt_GetBoolean(go, "--small")) { /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */ for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]); } } /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions * and rf2a_map, a map of non-gap RF positions to overall alignment positions */ if(msa->rf != NULL) { if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal(errbuf); } else i_am_rf = NULL; weights_exist = check_msa_weights(msa); use_weights = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE; if( (! esl_opt_GetBoolean(go, "--small")) && (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo") || esl_opt_IsOn(go, "--pcinfo") || esl_opt_IsOn(go, "--cinfo") || esl_opt_IsOn(go, "--bpinfo"))) { /* collect counts of each residue and PPs (if they exist) from the msa */ if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = count_msa(msa, errbuf, nali, esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */ esl_opt_GetBoolean(go, "--weight"), /* use msa->wgt sequence weights? */ &abc_ct, ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts? */ (msa->pp != NULL ? &pp_ct : NULL))) /* get PP counts? */ != eslOK) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--icinfo")) { if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--rinfo")) { if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--pcinfo")) { if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if(esl_opt_IsOn(go, "--psinfo")) { if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali); if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--iinfo")) { if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali); if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali); if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--cinfo")) { if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } if( esl_opt_IsOn(go, "--bpinfo")) { if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali); if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf); } esl_msa_Destroy(msa); if(abc_ct != NULL) { esl_Free2D((void **) abc_ct, alen); abc_ct = NULL; } if(bp_ct != NULL) { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct = NULL; } if(pp_ct != NULL) { esl_Free2D((void **) pp_ct, alen); pp_ct = NULL; } if(i_am_rf != NULL) { free(i_am_rf); i_am_rf = NULL; } if(rf2a_map != NULL) { free(rf2a_map); rf2a_map = NULL; } } /* If an msa read failed, we've dropped out to here with an informative status code. * we have to handle failures from new vs. legacy msa parsing differently */ if (esl_opt_GetBoolean(go, "--small")) { if (status == eslEFORMAT) esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf); else if (status != eslEOF) esl_fatal("Alignment file read failed with error code %d\n", status); else if (nali == 0) esl_fatal("No alignments found in file %s\n", alifile); } else { if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status); } /* Cleanup, normal return */ if(listfp != NULL) { fclose(listfp); printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list")); } if(icinfofp != NULL) { fclose(icinfofp); printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo")); } if(rinfofp != NULL) { fclose(rinfofp); printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo")); } if(pcinfofp != NULL) { fclose(pcinfofp); printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo")); } if(psinfofp != NULL) { fclose(psinfofp); printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo")); } if(iinfofp != NULL) { printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo")); fclose(iinfofp); } if(cinfofp != NULL) { printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo")); fclose(cinfofp); } if(bpinfofp != NULL) { printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo")); fclose(bpinfofp); } if (afp) eslx_msafile_Close(afp); if (old_afp) esl_msafile2_Close(old_afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* Function: p7_builder_Create() * Synopsis: Create a default HMM construction configuration. * * Purpose: Create a construction configuration for building * HMMs in alphabet <abc>, and return a pointer to it. * * An application configuration <go> may optionally be * provided. If <go> is <NULL>, default parameters are * used. If <go> is non-<NULL>, it must include appropriate * settings for all of the following ``standard build options'': * * Model construction: --fast --hand --symfrac --fragthresh * Relative weighting: --wgsc --wblosum --wpb --wgiven --wid * Effective seq #: --eent --eclust --enone --eset --ere --esigma --eid * Prior scheme: --pnone --plaplace * E-val calibration: --EmL --EmN --EvL --EvN --EfL --EfN --Eft * run-to-run variation: --seed * * See <hmmbuild.c> or other big users of the build * pipeline for an example of appropriate <ESL_GETOPTS> * initializations of these 24 options. */ P7_BUILDER * p7_builder_Create(const ESL_GETOPTS *go, const ESL_ALPHABET *abc) { P7_BUILDER *bld = NULL; int seed; int status; ESL_ALLOC(bld, sizeof(P7_BUILDER)); bld->prior = NULL; bld->r = NULL; bld->S = NULL; bld->Q = NULL; bld->eset = -1.0; /* -1.0 = unset; must be set if effn_strategy is p7_EFFN_SET */ bld->re_target = -1.0; if (go == NULL) { bld->arch_strategy = p7_ARCH_FAST; bld->wgt_strategy = p7_WGT_PB; bld->effn_strategy = p7_EFFN_ENTROPY; seed = 42; } else { if (esl_opt_GetBoolean(go, "--fast")) bld->arch_strategy = p7_ARCH_FAST; else if (esl_opt_GetBoolean(go, "--hand")) bld->arch_strategy = p7_ARCH_HAND; if (esl_opt_GetBoolean(go, "--wpb")) bld->wgt_strategy = p7_WGT_PB; else if (esl_opt_GetBoolean(go, "--wgsc")) bld->wgt_strategy = p7_WGT_GSC; else if (esl_opt_GetBoolean(go, "--wblosum")) bld->wgt_strategy = p7_WGT_BLOSUM; else if (esl_opt_GetBoolean(go, "--wnone")) bld->wgt_strategy = p7_WGT_NONE; else if (esl_opt_GetBoolean(go, "--wgiven")) bld->wgt_strategy = p7_WGT_GIVEN; if (esl_opt_GetBoolean(go, "--eent")) bld->effn_strategy = p7_EFFN_ENTROPY; else if (esl_opt_GetBoolean(go, "--eclust")) bld->effn_strategy = p7_EFFN_CLUST; else if (esl_opt_GetBoolean(go, "--enone")) bld->effn_strategy = p7_EFFN_NONE; else if (esl_opt_IsOn (go, "--eset")) { bld->effn_strategy = p7_EFFN_SET; bld->eset = esl_opt_GetReal(go, "--eset"); } seed = esl_opt_GetInteger(go, "--seed"); } bld->max_insert_len = 0; /* The default RE target is alphabet dependent. */ if (go != NULL && esl_opt_IsOn (go, "--ere")) bld->re_target = esl_opt_GetReal(go, "--ere"); else { switch (abc->type) { case eslAMINO: bld->re_target = p7_ETARGET_AMINO; break; case eslDNA: bld->re_target = p7_ETARGET_DNA; break; case eslRNA: bld->re_target = p7_ETARGET_DNA; break; default: bld->re_target = p7_ETARGET_OTHER; break; } } bld->symfrac = (go != NULL) ? esl_opt_GetReal (go, "--symfrac") : 0.5; bld->fragthresh = (go != NULL) ? esl_opt_GetReal (go, "--fragthresh") : 0.5; bld->wid = (go != NULL) ? esl_opt_GetReal (go, "--wid") : 0.62; bld->esigma = (go != NULL) ? esl_opt_GetReal (go, "--esigma") : 45.0; bld->eid = (go != NULL) ? esl_opt_GetReal (go, "--eid") : 0.62; bld->EmL = (go != NULL) ? esl_opt_GetInteger(go, "--EmL") : 200; bld->EmN = (go != NULL) ? esl_opt_GetInteger(go, "--EmN") : 200; bld->EvL = (go != NULL) ? esl_opt_GetInteger(go, "--EvL") : 200; bld->EvN = (go != NULL) ? esl_opt_GetInteger(go, "--EvN") : 200; bld->EfL = (go != NULL) ? esl_opt_GetInteger(go, "--EfL") : 100; bld->EfN = (go != NULL) ? esl_opt_GetInteger(go, "--EfN") : 200; bld->Eft = (go != NULL) ? esl_opt_GetReal (go, "--Eft") : 0.04; /* Normally we reinitialize the RNG to original seed before calibrating each model. * This eliminates run-to-run variation. * As a special case, seed==0 means choose an arbitrary seed and shut off the * reinitialization; this allows run-to-run variation. */ bld->r = esl_randomness_CreateFast(seed); bld->do_reseeding = (seed == 0) ? FALSE : TRUE; if (go && esl_opt_GetBoolean(go, "--pnone") ) bld->prior = NULL; else if (go && esl_opt_GetBoolean(go, "--plaplace") ) bld->prior = p7_prior_CreateLaplace(abc); else { switch (abc->type) { case eslAMINO: bld->prior = p7_prior_CreateAmino(); break; case eslDNA: bld->prior = p7_prior_CreateNucleic(); break; case eslRNA: bld->prior = p7_prior_CreateNucleic(); break; default: bld->prior = p7_prior_CreateLaplace(abc); break; } if (bld->prior == NULL) goto ERROR; } bld->abc = abc; bld->errbuf[0] = '\0'; bld->popen = -1; bld->pextend = -1; return bld; ERROR: p7_builder_Destroy(bld); return NULL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); double mu = esl_opt_GetReal(go, "-m"); double lambda = esl_opt_GetReal(go, "-l"); double tau = esl_opt_GetReal(go, "-t"); ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); ESL_HISTOGRAM *h = esl_histogram_CreateFull(mu, 100., esl_opt_GetReal(go, "-w"));; int n = esl_opt_GetInteger(go, "-n"); int be_verbose = esl_opt_GetBoolean(go, "-v"); char *plotfile = esl_opt_GetString(go, "-o"); FILE *pfp = stdout; int plot_pdf = esl_opt_GetBoolean(go, "--P"); int plot_logpdf = esl_opt_GetBoolean(go, "--LP"); int plot_cdf = esl_opt_GetBoolean(go, "--C"); int plot_logcdf = esl_opt_GetBoolean(go, "--LC"); int plot_surv = esl_opt_GetBoolean(go, "--S"); int plot_logsurv = esl_opt_GetBoolean(go, "--LS"); double xmin = esl_opt_IsOn(go, "--XL") ? esl_opt_GetReal(go, "--XL") : mu; double xmax = esl_opt_IsOn(go, "--XH") ? esl_opt_GetReal(go, "--XH") : mu+40*(1./lambda); double xstep = esl_opt_IsOn(go, "--XS") ? esl_opt_GetReal(go, "--XS") : 0.1; double emu, elambda, etau; int i; double x; double *data; int ndata; if (be_verbose) printf("Parametric: mu = %f lambda = %f tau = %f\n", mu, lambda, tau); if (plotfile != NULL) { if ((pfp = fopen(plotfile, "w")) == NULL) esl_fatal("Failed to open plotfile"); } for (i = 0; i < n; i++) { x = esl_sxp_Sample(r, mu, lambda, tau); esl_histogram_Add(h, x); } esl_histogram_GetData(h, &data, &ndata); esl_sxp_FitComplete(data, ndata, &emu, &elambda, &etau); if (be_verbose) printf("Complete data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) esl_fatal("Error in (complete) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) esl_fatal("Error in (complete) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) esl_fatal("Error in (complete) fitted tau > 10%\n"); esl_sxp_FitCompleteBinned(h, &emu, &elambda, &etau); if (be_verbose) printf("Binned data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) esl_fatal("Error in (binned) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) esl_fatal("Error in (binned) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) esl_fatal("Error in (binned) fitted tau > 10%\n"); if (plot_pdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_pdf, xmin, xmax, xstep); if (plot_logpdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logpdf, xmin, xmax, xstep); if (plot_cdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_cdf, xmin, xmax, xstep); if (plot_logcdf) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logcdf, xmin, xmax, xstep); if (plot_surv) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_surv, xmin, xmax, xstep); if (plot_logsurv) esl_sxp_Plot(pfp, mu, lambda, tau, &esl_sxp_logsurv, xmin, xmax, xstep); if (plotfile != NULL) fclose(pfp); esl_histogram_Destroy(h); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
/* seq_shuffling() * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg] * * Shuffling of input sequences. * * Fixed-length (L>0) vs. full-length (L=0) modes handled differently. * In fixed-length mode: * <shuff->seq> only needs to be allocated once, for L * <targ> is an allocated copy of a random subseq of length L * sequences < L residues long can't be shuffled * In full-length mode: * <shuff->seq> is grown to length <sq->n> for each input seq * <targ> just points to <sq->seq> */ static int seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt) { char *seqfile = esl_opt_GetArg(go, 1); int infmt = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; ESL_SQ *sq = esl_sq_Create(); ESL_SQ *shuff = esl_sq_Create(); char *targ = NULL; int N = esl_opt_GetInteger(go, "-N"); int L = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */ int kmers = 0; int i; int status; if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k"); status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); if (L>0) { esl_sq_GrowTo(shuff, L); shuff->n = L; ESL_ALLOC(targ, sizeof(char) * (L+1)); } while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) { if (L == 0) { /* shuffling entire sequence */ esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */ shuff->n = sq->n; targ = sq->seq; } else { if (sq->n < L) continue; /* reject seqs < L long */ } for (i = 0; i < N; i++) { if (L > 0) { /* fixed-len mode: copy a random subseq */ int pos = esl_rnd_Roll(r, sq->n - L + 1); strncpy(targ, sq->seq + pos, L); targ[L] = '\0'; } /* Do the requested kind of shuffling */ if (esl_opt_GetBoolean(go, "-m")) esl_rsq_CShuffle (r, targ, shuff->seq); /* monoresidue shuffling */ else if (esl_opt_GetBoolean(go, "-d")) esl_rsq_CShuffleDP (r, targ, shuff->seq); /* diresidue shuffling */ else if (esl_opt_IsOn (go, "-k")) esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq); /* diresidue shuffling */ else if (esl_opt_GetBoolean(go, "-0")) esl_rsq_CMarkov0 (r, targ, shuff->seq); /* 0th order Markov */ else if (esl_opt_GetBoolean(go, "-1")) esl_rsq_CMarkov1 (r, targ, shuff->seq); /* 1st order Markov */ else if (esl_opt_GetBoolean(go, "-r")) esl_rsq_CReverse ( targ, shuff->seq); /* reverse */ else if (esl_opt_IsOn (go, "-w")) { /* regionally shuffle */ int W= esl_opt_GetInteger(go, "-w"); esl_rsq_CShuffleWindows(r, targ, W, shuff->seq); } /* Set the name of the shuffled sequence */ if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i); else esl_sq_FormatName(shuff, "%s-shuffled", sq->name); /* Output the resulting sequence */ esl_sqio_Write(ofp, shuff, outfmt, FALSE); /* don't need to reuse the shuffled sequence: we will use exactly the same memory */ } esl_sq_Reuse(sq); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename); if (L>0) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return eslOK; ERROR: if (targ != NULL) free(targ); esl_sq_Destroy(shuff); esl_sq_Destroy(sq); esl_sqfile_Close(sqfp); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage); char *msafile = esl_opt_GetArg(go, 1); int infmt = eslMSAFILE_UNKNOWN; int outfmt = eslMSAFILE_UNKNOWN; ESL_ALPHABET *abc = NULL; ESL_MSAFILE *afp = NULL; ESL_MSA *msa = NULL; int nali = 0; int status; /* Alphabet specification from cmdline? */ if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); /* MSA input file format from cmdline? */ if (esl_opt_IsOn(go, "--informat") && (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("Your --informat, %s, is not a recognized multiple alignment file format", esl_opt_GetString(go, "--informat")); /* Open in digital mode. Autoguess alphabet, format if we haven't set them already. */ if (( status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status); /* Reverse complementation only makes sense for alphabets that have abc->complement set */ if (! abc->complement) esl_fatal("File %s appears to use the %s alphabet.\nThat alphabet cannot be reverse complemented.\n", msafile, esl_abc_DecodeType(abc->type)); /* Set the output format, if requested. * By default, write in the same format we read in. * Remember, it's afp->format that gets set; infmt was only a hint. */ if ( esl_opt_IsOn(go, "--outformat") ) { outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("Your --outformat, %s, is not a recognized multiple alignment file format.\n", esl_opt_GetString(go, "--outformat")); } else outfmt = afp->format; /* Here we go. */ while ((status = esl_msafile_Read(afp, &msa)) == eslOK) { nali++; status = esl_msa_ReverseComplement(msa); esl_msafile_Write(stdout, msa, outfmt); esl_msa_Destroy(msa); } if (nali == 0) esl_fatal("No alignments found in input file %s\n", msafile); if (status != eslEOF) esl_msafile_ReadFailure(afp, status); esl_msafile_Close(afp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* application configuration */ char *alifile = NULL; /* alignment file name */ int infmt = eslMSAFILE_UNKNOWN; /* format code for alifile */ int outfmt = eslMSAFILE_UNKNOWN; /* output format for fetched msa's */ ESLX_MSAFILE *afp = NULL; /* open alignment file */ FILE *ofp = NULL; /* output stream for alignments */ int status; /* easel return code */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input alignment file format for --informat", esl_opt_GetString(go, "--informat")); } outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid output alignment file format for --outformat", esl_opt_GetString(go, "--outformat")); alifile = esl_opt_GetArg(go, 1); /* Open the alignment file. */ if ( (status = eslx_msafile_Open(NULL, alifile, NULL, infmt, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); /* Open the SSI index, if any */ if (! esl_opt_GetBoolean(go, "--index")) { if (afp->bf->mode_is == eslBUFFER_FILE || afp->bf->mode_is == eslBUFFER_ALLFILE || afp->bf->mode_is == eslBUFFER_MMAP) { char *ssifile = NULL; esl_sprintf(&ssifile, "%s.ssi", afp->bf->filename); status = esl_ssi_Open(ssifile, &(afp->ssi)); if (status == eslERANGE ) esl_fatal("SSI index %s has 64-bit offsets; this system doesn't support them", ssifile); else if (status == eslEFORMAT) esl_fatal("SSI index %s has an unrecognized format. Try recreating, w/ esl-afetch --index", ssifile); else if (status == eslENOTFOUND) afp->ssi = NULL; else if (status != eslOK) esl_fatal("SSI index %s: open failed, error code %d\n", ssifile, status); free(ssifile); } } /* Open the output file, if any */ if (esl_opt_GetBoolean(go, "-O")) { if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetArg(go, 2)); } else if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Hand off control flow as appropriate */ if (esl_opt_GetBoolean(go, "--index")) { if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); create_ssi_index(go, afp); } else if (esl_opt_GetBoolean(go, "-f")) { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); multifetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); } else { if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n"); onefetch(go, ofp, outfmt, esl_opt_GetArg(go, 2), afp); if (ofp != stdout) printf("\n\nRetrieved alignment %s.\n", esl_opt_GetArg(go, 2)); } eslx_msafile_Close(afp); esl_getopts_Destroy(go); exit(0); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_STOPWATCH *w = esl_stopwatch_Create(); struct cfg_s cfg; p7_Init(); /* Process command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nCommon options:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/ puts("\nChoice of score type :"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); puts("\nChoice of alignment mode :"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); puts("\nChoice of multi vs. unihit config :"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); puts("\nChoice of generic vs. vector DP implementation :"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); puts("\nOutput options (use only in serial mode, for single HMM input):"); esl_opt_DisplayHelp(stdout, go, 6, 2, 80); puts("\nControlling range of fitted tail masses :"); esl_opt_DisplayHelp(stdout, go, 7, 2, 80); puts("\nRecalibrating E-values, and replacing HMM's existing parameters :"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); puts("\nDebugging :"); esl_opt_DisplayHelp(stdout, go, 9, 2, 80); puts("\nExperiments :"); esl_opt_DisplayHelp(stdout, go, 10, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); puts("\nwhere general options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup, 2 = indentation; 80=textwidth*/ printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } /* Validate combinations of score/config/implementation (4x3x2x2, score x mode x hit x DPimpl: 20 of 48 possible combos valid */ if (esl_opt_GetBoolean(go, "--vector") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("SIMD vector implementations only work for local alignment."); /* -16/48 */ if (esl_opt_GetBoolean(go, "--msv") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("MSV scoring is local by definition: use --local."); /* -4/48 */ if (esl_opt_GetBoolean(go, "--vit") && ! esl_opt_GetBoolean(go, "--local")) p7_Fail("no p7_GViterbiDual for new dual-mode profile implemented yet"); /* -4/48 */ /* Initialize configuration shared across all kinds of masters * and workers in this .c file. */ cfg.hmmfile = esl_opt_GetArg(go, 1); cfg.r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed")); // cfg.abc = esl_alphabet_Create(eslAMINO); cfg.my_rank = 0; /* MPI init will change this soon, if --mpi was set */ cfg.nproc = 0; /* MPI init will change this soon, if --mpi was set */ cfg.do_mpi = FALSE; /* --mpi will change this soon (below) if necessary */ cfg.do_stall = esl_opt_GetBoolean(go, "--stall"); cfg.N = esl_opt_GetInteger(go, "-N"); cfg.L = esl_opt_GetInteger(go, "-L"); cfg.hfp = NULL; cfg.ofp = NULL; cfg.survfp = NULL; cfg.efp = NULL; cfg.ffp = NULL; cfg.xfp = NULL; cfg.alfp = NULL; cfg.bg = NULL; /* This is our stall point, if we need to wait until we get a * debugger attached to this process for debugging (especially * useful for MPI): */ while (cfg.do_stall); /* Start timing. */ esl_stopwatch_Start(w); /* Main body: * Handed off to serial version or MPI masters and workers as appropriate. */ #ifdef HAVE_MPI if (esl_opt_GetBoolean(go, "--mpi")) { /* Initialize MPI, figure out who we are, and whether we're running * this show (proc 0) or working in it (procs >0). */ cfg.do_mpi = TRUE; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &(cfg.my_rank)); MPI_Comm_size(MPI_COMM_WORLD, &(cfg.nproc)); if (cfg.my_rank == 0 && cfg.nproc < 2) p7_Fail("Need at least 2 MPI processes to run --mpi mode."); if (cfg.my_rank > 0) mpi_worker(go, &cfg); else mpi_master(go, &cfg); esl_stopwatch_Stop(w); esl_stopwatch_MPIReduce(w, 0, MPI_COMM_WORLD); MPI_Finalize(); /* both workers and masters reach this line */ } else #endif /*HAVE_MPI*/ { /* No MPI? Then we're just the serial master. */ serial_master(go, &cfg); esl_stopwatch_Stop(w); } /* Stop timing. */ if (cfg.my_rank == 0) esl_stopwatch_Display(stdout, w, "# CPU time: "); /* Clean up and exit. */ if (cfg.my_rank == 0) { if (cfg.hfp != NULL) p7_hmmfile_Close(cfg.hfp); if (esl_opt_IsOn(go, "-o")) fclose(cfg.ofp); if (cfg.survfp != NULL) fclose(cfg.survfp); if (cfg.efp != NULL) fclose(cfg.efp); if (cfg.ffp != NULL) fclose(cfg.ffp); if (cfg.xfp != NULL) fclose(cfg.xfp); if (cfg.alfp != NULL) fclose(cfg.alfp); } p7_bg_Destroy(cfg.bg); esl_alphabet_Destroy(cfg.abc); esl_randomness_Destroy(cfg.r); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go; /* application configuration */ int kstatus, tstatus;/* return code from Easel routine */ int fmt; /* expected format of kfile, tfile */ char *kfile, *tfile; /* known, test structure file */ ESL_MSAFILE *kfp, *tfp; /* open kfile, tfile */ ESL_MSA *ka, *ta; /* known, trusted alignment */ int64_t klen, tlen; /* lengths of dealigned seqs */ int i; /* counter over sequences */ int apos; /* counter over alignment columns */ int rfpos; /* counter over consensus (non-gap RF) columns */ int is_rfpos; /* TRUE if current apos is a consensus pos, FALSE if not */ int uapos; /* counter over unaligned residue positions */ int nali; /* number of alignment we're on in each file */ int **kp; /* [0..i..nseq-1][1..r..sq->n] = x known non-gap RF position of residue r in sequence i */ int **tp; /* [0..i..nseq-1][1..r..sq->n] = x predicted non-gap RF position of residue r in sequence i */ /* for both kp and pp, if x <= 0, residue r for seq i is not aligned to a non-gap RF position, but rather as an 'insert' * after non-gap RF position (x * -1) */ int *km_pos; /* [0..rflen] = x, in known aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ki_pos; /* [0..rflen] = x, in known aln, number of residues inserted after non-gap RF column x */ int *tm_pos; /* [0..rflen] = x, in predicted aln, number of residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *ti_pos; /* [0..rflen] = x, in predicted aln, number of residues inserted after non-gap RF column x */ int *cor_tm_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF column x; special case: mct[0] = 0 */ int *cor_ti_pos; /* [0..rflen] = x, in predicted aln, number of correctly predicted residues inserted after non-gap RF column x */ int *km_seq; /* [0..i..nseq-1] = x, in known aln, number of residues aligned to non-gap RF columns in seq i; */ int *ki_seq; /* [0..i..nseq-1] = x, in known aln, number of residues inserted in seq i */ int *tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues aligned to non-gap RF columns in seq i; */ int *ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of residues inserted in seq i */ int *cor_tm_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues aligned to non-gap RF columns in seq i */ int *cor_ti_seq; /* [0..i..nseq-1] = x, in predicted aln, number of correctly predicted residues inserted in seq i */ int *seqlen; /* [0..i..nseq-1] = x, unaligned seq i has length x */ ESL_ALPHABET *abc = NULL; /* alphabet for all alignments */ int rflen, t_rflen; /* non-gap RF length (consensus lengths) */ int status; char *namedashes; int ni; int namewidth = 8; /* length of 'seq name' */ int cor_tm, cor_ti, km, ki; /* correct predicted match, correct predicted insert, total match, total insert */ char *mask = NULL; int masklen; ESL_DSQ *ks; ESL_DSQ *ts; FILE *dfp = NULL; /* for --c2dfile */ /* variables needed for -p and related options */ int do_post = FALSE; /* TRUE if -p enabled */ int do_post_for_this_rfpos = FALSE; /* set for each consensus position, always TRUE unless --mask-p2xm */ int p; /* counter over integerized posteriors */ int *ptm = NULL; /* [0..p..10] number of total matches with posterior value p (10="*")*/ int *pti = NULL; /* [0..p..10] number of total inserts with posterior value p */ int *cor_ptm = NULL; /* [0..p..10] number of correct matches with posterior value p */ int *cor_pti = NULL; /* [0..p..10] number of correct inserts with posterior value p */ int npostvals = 11; /* number of posterior values 0-9, * */ int ppidx; /* index of PP */ char ppchars[11] = "0123456789*"; int cm_cor_ptm, cm_cor_pti, cm_ptm, cm_pti, cm_incor_ptm, cm_incor_pti; /* cumulative counts of posteriors */ // int tot_cor_ptm, tot_cor_pti, tot_ptm, tot_pti; /* total counts of posteriors */ // int tot_incor_ptm,tot_incor_pti; // SRE: commented out; don't seem to be used; need to silence compiler warning char errbuf[eslERRBUFSIZE]; /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); exit(EXIT_SUCCESS); } if (esl_opt_ArgNumber(go) != 2) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } kfile = esl_opt_GetArg(go, 1); tfile = esl_opt_GetArg(go, 2); fmt = eslMSAFILE_STOCKHOLM; /*********************************************** * Open the two Stockholm files. ***********************************************/ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); if ( (kstatus = esl_msafile_Open(&abc, kfile, NULL, fmt, NULL, &kfp)) != eslOK) esl_msafile_OpenFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Open(&abc, tfile, NULL, fmt, NULL, &tfp)) != eslOK) esl_msafile_OpenFailure(tfp, tstatus); do_post = esl_opt_GetBoolean(go, "-p"); /* read the mask file if --p-mask is enabled */ if(! esl_opt_IsDefault(go, "--p-mask")) { if((status = read_mask_file(esl_opt_GetString(go, "--p-mask"), errbuf, &mask, &masklen)) != eslOK) esl_fatal(errbuf); } /* open the c2dfile for output, if nec */ if (esl_opt_IsOn(go, "--c2dfile")) { if ((dfp = fopen(esl_opt_GetString(go, "--c2dfile"), "w")) == NULL) esl_fatal("Failed to open --c2dfile output file %s\n", esl_opt_GetString(go, "--c2dfile")); } /*********************************************** * Do alignment comparisons, one seq at a time; * this means looping over all seqs in all alignments. ***********************************************/ nali = 0; while ( (kstatus = esl_msafile_Read(kfp, &ka)) != eslEOF) { if ( kstatus != eslOK) esl_msafile_ReadFailure(kfp, kstatus); if ( (tstatus = esl_msafile_Read(tfp, &ta)) != eslOK) esl_msafile_ReadFailure(tfp, tstatus); nali++; if((nali > 1) && (esl_opt_IsOn(go, "--c2dfile"))) esl_fatal("--c2dfile is only meant for msafiles with single alignments"); /* Sanity check on alignment */ if (ka->nseq != ta->nseq) esl_fatal("trusted, test alignments don't have same seq #\n"); if (ka->rf == NULL) esl_fatal("trusted alignment has no reference annotation\n"); if (ta->rf == NULL) esl_fatal("test alignment has no reference annotation\n"); /* make sure the sequences are all identical */ ESL_ALLOC(seqlen, sizeof(int) * ka->nseq); for(i = 0; i < ka->nseq; i++) { if(strcmp(ka->sqname[i], ta->sqname[i]) != 0) esl_fatal("sequence %d of trusted alignment %s has different name than seq %d of predicted alignment %s\n", (i+1), ka->sqname[i], (i+1), ta->sqname[i]); ESL_ALLOC(ks, sizeof(ESL_DSQ) * (ka->alen+2)); memcpy(ks, ka->ax[i], (ka->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ka->abc, ks, ka->ax[i], &klen); ESL_ALLOC(ts, sizeof(ESL_DSQ) * (ta->alen+2)); memcpy(ts, ta->ax[i], (ta->alen+2) * sizeof(ESL_DSQ)); esl_abc_XDealign(ta->abc, ts, ta->ax[i], &tlen); if (tlen != klen) esl_fatal("dealigned sequence mismatch, seq %d, when dealigned, is %d residues in the known alignment, but %d residues in the trusted alignment.", (i+1), klen, tlen); if (memcmp(ks, ts, sizeof(ESL_DSQ) * klen) != 0) esl_fatal("dealigned sequence mismatch, seq %d %s, when dealigned, are not identical.", (i+1), ka->sqname[i]); seqlen[i] = tlen; free(ks); free(ts); } /* determine non-gap RF length */ rflen = 0; for(apos = 1; apos <= ka->alen; apos++) { if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing(ka->abc, ka->rf[apos-1]))) rflen++; } t_rflen = 0; for(apos = 1; apos <= ta->alen; apos++) { if((! esl_abc_CIsGap (ta->abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (ta->abc, ta->rf[apos-1]))) t_rflen++; } if(t_rflen != rflen) esl_fatal("Trusted alignment non-gap RF length (%d) != predicted alignment non-gap RF length (%d).\n", rflen, t_rflen); /* if -p, make sure the test alignment has posterior probabilities, and allocate our counters for correct/incorrect per post value */ if(do_post) { if(! esl_opt_IsDefault(go, "--p-mask")) { if(masklen != rflen) { esl_fatal("Length of mask in %s (%d) not equal to non-gap RF len of alignments (%d)\n", esl_opt_GetString(go, "--p-mask"), masklen, rflen); } } if(ta->pp == NULL) esl_fatal("-p requires \"#=GR PP\" annotation in the test alignment, but none exists"); ESL_ALLOC(ptm, sizeof(int) * npostvals); ESL_ALLOC(pti, sizeof(int) * npostvals); ESL_ALLOC(cor_ptm, sizeof(int) * npostvals); ESL_ALLOC(cor_pti, sizeof(int) * npostvals); esl_vec_ISet(ptm, npostvals, 0); esl_vec_ISet(pti, npostvals, 0); esl_vec_ISet(cor_ptm, npostvals, 0); esl_vec_ISet(cor_pti, npostvals, 0); } /* allocate and initialize our counters */ ESL_ALLOC(kp, sizeof(int *) * ka->nseq); ESL_ALLOC(tp, sizeof(int *) * ta->nseq); for(i = 0; i < ka->nseq; i++) { ESL_ALLOC(kp[i], sizeof(int) * (seqlen[i]+1)); ESL_ALLOC(tp[i], sizeof(int) * (seqlen[i]+1)); esl_vec_ISet(kp[i], seqlen[i]+1, -987654321); esl_vec_ISet(tp[i], seqlen[i]+1, -987654321); } ESL_ALLOC(km_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ki_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(ti_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_tm_pos, sizeof(int) * (rflen+1)); ESL_ALLOC(cor_ti_pos, sizeof(int) * (rflen+1)); esl_vec_ISet(km_pos, rflen+1, 0); esl_vec_ISet(ki_pos, rflen+1, 0); esl_vec_ISet(tm_pos, rflen+1, 0); esl_vec_ISet(ti_pos, rflen+1, 0); esl_vec_ISet(cor_tm_pos, rflen+1, 0); esl_vec_ISet(cor_ti_pos, rflen+1, 0); ESL_ALLOC(km_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ki_seq, sizeof(int) * ka->nseq); ESL_ALLOC(tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(ti_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_tm_seq, sizeof(int) * ka->nseq); ESL_ALLOC(cor_ti_seq, sizeof(int) * ka->nseq); esl_vec_ISet(km_seq, ka->nseq, 0); esl_vec_ISet(ki_seq, ka->nseq, 0); esl_vec_ISet(tm_seq, ka->nseq, 0); esl_vec_ISet(ti_seq, ka->nseq, 0); esl_vec_ISet(cor_tm_seq, ka->nseq, 0); esl_vec_ISet(cor_ti_seq, ka->nseq, 0); /* determine non-gap RF location of each residue in known alignment */ for(i = 0; i < ka->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ka->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (ka->abc, ka->rf[apos-1])) && (! esl_abc_CIsMissing (ka->abc, ka->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; } if(esl_abc_XIsResidue(ka->abc, ka->ax[i][apos])) { uapos++; kp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(is_rfpos) { km_pos[rfpos]++; km_seq[i]++; } else { ki_pos[rfpos]++; ki_seq[i]++; } } } } /* determine non-gap RF location of each residue in predicted alignment */ for(i = 0; i < ta->nseq; i++) { uapos = rfpos = 0; for(apos = 1; apos <= ta->alen; apos++) { is_rfpos = FALSE; if((! esl_abc_CIsGap (abc, ta->rf[apos-1])) && (! esl_abc_CIsMissing (abc, ta->rf[apos-1]))) { rfpos++; is_rfpos = TRUE; if(do_post) { do_post_for_this_rfpos = (mask != NULL && mask[rfpos-1] == '0') ? FALSE : TRUE; } } if(esl_abc_XIsResidue(ta->abc, ta->ax[i][apos])) { uapos++; tp[i][uapos] = (is_rfpos) ? rfpos : (-1 * rfpos); if(do_post) { if(esl_abc_CIsGap(abc, ta->pp[i][(apos-1)])) esl_fatal("gap PP value for nongap residue: ali: %d seq: %d apos: %d\n", nali, i, apos); ppidx = get_pp_idx(abc, ta->pp[i][(apos-1)]); if(ppidx == -1) esl_fatal("unrecognized PP value (%c) for nongap residue: ali: %d seq: %d apos: %d\n", ta->pp[i][(apos-1)], nali, i, apos); } if(is_rfpos) { tm_pos[rfpos]++; tm_seq[i]++; if(do_post_for_this_rfpos) ptm[ppidx]++; } else { ti_pos[rfpos]++; ti_seq[i]++; if(do_post) pti[ppidx]++; } if(kp[i][uapos] == tp[i][uapos]) { /* correctly predicted this residue */ if(is_rfpos) { cor_tm_seq[i]++; cor_tm_pos[rfpos]++; if(do_post_for_this_rfpos) cor_ptm[ppidx]++; } else { cor_ti_seq[i]++; cor_ti_pos[rfpos]++; if(do_post) cor_pti[ppidx]++; } } } } } if((! (esl_opt_GetBoolean(go, "-c"))) && (! esl_opt_GetBoolean(go, "-p"))) { /* print per sequence statistics */ /* determine the longest name in msa */ for(ni = 0; ni < ka->nseq; ni++) namewidth = ESL_MAX(namewidth, strlen(ka->sqname[ni])); ESL_ALLOC(namedashes, sizeof(char) * namewidth+1); namedashes[namewidth] = '\0'; for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-'; printf("# %-*s %6s %28s %28s %28s\n", namewidth, "seq name", "len", "match columns", "insert columns", "all columns"); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "------", "----------------------------", "----------------------------", "----------------------------"); for(i = 0; i < ta->nseq; i++) { printf(" %-*s %6d %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, ka->sqname[i], seqlen[i], cor_tm_seq[i], km_seq[i], (km_seq[i] == 0) ? 0. : ((float) cor_tm_seq[i] / (float) km_seq[i]), cor_ti_seq[i], ki_seq[i], (ki_seq[i] == 0) ? 0. : ((float) cor_ti_seq[i] / (float) ki_seq[i]), (cor_tm_seq[i] + cor_ti_seq[i]), (km_seq[i] + ki_seq[i]), ((float) (cor_tm_seq[i] + cor_ti_seq[i]) / ((float) km_seq[i] + ki_seq[i]))); } cor_tm = esl_vec_ISum(cor_tm_seq, ka->nseq); cor_ti = esl_vec_ISum(cor_ti_seq, ka->nseq); km = esl_vec_ISum(km_seq, ka->nseq); ki = esl_vec_ISum(ki_seq, ka->nseq); printf("# %-*s %6s %28s %28s %28s\n", namewidth, namedashes, "-----", "----------------------------", "----------------------------", "----------------------------"); printf("# %-*s %6s %8d / %8d (%.3f) %8d / %8d (%.3f) %8d / %8d (%.3f)\n", namewidth, "*all*", "-", cor_tm, km, ((float) cor_tm / (float) km), cor_ti, ki, ((float) cor_ti / (float) ki), (cor_tm+cor_ti), (km+ki), (((float) (cor_tm + cor_ti))/ ((float) (km + ki)))); free(namedashes); for(i = 0; i < ka->nseq; i++) { free(kp[i]); free(tp[i]); } } else if(esl_opt_GetBoolean(go, "-c")) { /* print per column statistics */ printf("# %5s %20s %20s %20s\n", "rfpos", "match", "insert", "both"); printf("# %5s %20s %20s %20s\n", "-----", "--------------------", "--------------------", "--------------------"); for(rfpos = 0; rfpos <= rflen; rfpos++) { printf(" %5d %4d / %4d (%.3f) %4d / %4d (%.3f) %4d / %4d (%.3f)\n", rfpos, cor_tm_pos[rfpos], km_pos[rfpos], (km_pos[rfpos] == 0) ? 0. : ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), cor_ti_pos[rfpos], ki_pos[rfpos], (ki_pos[rfpos] == 0) ? 0. : ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]), (km_pos[rfpos] + ki_pos[rfpos]), ((float) (cor_tm_pos[rfpos] + cor_ti_pos[rfpos]) / ((float) km_pos[rfpos] + ki_pos[rfpos]))); } } else if(do_post) { /* do posterior output */ if(mask == NULL) { printf("# %2s %29s %29s\n", "", " match columns ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } else { printf("# %2s %29s %29s\n", "", " match columns within mask ", " insert columns "); printf("# %2s %29s %29s\n", "", "-----------------------------", "-----------------------------") ; printf("# %2s %8s %8s %9s %8s %8s %9s\n", "PP", "ncorrect", "ntotal", "fractcor", "ncorrect", "ntotal", "fractcor"); printf("# %2s %8s %8s %9s %8s %8s %9s\n", "--", "--------", "--------", "---------", "--------", "--------", "---------"); } cm_ptm = cm_pti = cm_cor_ptm = cm_cor_pti = cm_incor_ptm = cm_incor_pti = 0; //tot_ptm = esl_vec_ISum(ptm, npostvals); //tot_pti = esl_vec_ISum(pti, npostvals); //tot_cor_ptm = esl_vec_ISum(cor_ptm, npostvals); //tot_cor_pti = esl_vec_ISum(cor_pti, npostvals); //tot_incor_ptm = tot_ptm - tot_cor_ptm; //tot_incor_pti = tot_pti - tot_cor_pti; for(p = (npostvals-1); p >= 0; p--) { cm_cor_ptm += cor_ptm[p]; cm_cor_pti += cor_pti[p]; cm_ptm += ptm[p]; cm_pti += pti[p]; cm_incor_ptm += ptm[p] - cor_ptm[p]; cm_incor_pti += pti[p] - cor_pti[p]; printf(" %2c %8d / %8d (%.5f) %8d / %8d (%.5f)\n", ppchars[p], cor_ptm[p], ptm[p], (ptm[p] == 0) ? 0. : (float) cor_ptm[p] / (float) ptm[p], cor_pti[p], pti[p], (pti[p] == 0) ? 0. : (float) cor_pti[p] / (float) pti[p]); } } /* handle --c2dfile */ if (dfp != NULL) { /* match stats, 4 fields, CMYK color values */ for(rfpos = 1; rfpos <= rflen; rfpos++) { if(km_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_tm_pos[rfpos] / (float) km_pos[rfpos]), /* magenta, fraction incorrect */ 1. - ((float) km_pos[rfpos] / ta->nseq), /* yellow, 1 - fraction of seqs with residue in column */ 0.); } } fprintf(dfp, "//\n"); /* insert stats, 4 fields, CMYK color values */ rfpos = 0; /* special case, combine insert posn 0 and 1 together */ if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) (cor_ti_pos[0] + cor_ti_pos[1]) / ((float) (ki_pos[0] + ki_pos[1]))), /* magenta, fraction correct */ 0., 0.); } /* insert stats posn 2..rflen */ for(rfpos = 2; rfpos <= rflen; rfpos++) { if(ki_pos[rfpos] == 0) { /* special case, no known alignment residues, a blank position */ fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., 0., 0., 0.); } else { fprintf(dfp, "%.3f %.3f %.3f %.3f\n", 0., /* cyan */ 1. - ((float) cor_ti_pos[rfpos] / (float) ki_pos[rfpos]), /* magenta, fraction correct */ 0., 0.); } } fprintf(dfp, "//\n"); } if(ptm != NULL) free(ptm); if(pti != NULL) free(pti); if(cor_ptm != NULL) free(cor_ptm); if(cor_ptm != NULL) free(cor_pti); free(kp); free(tp); free(km_seq); free(ki_seq); free(tm_seq); free(ti_seq); free(cor_tm_seq); free(cor_ti_seq); free(km_pos); free(ki_pos); free(tm_pos); free(ti_pos); free(cor_tm_pos); free(cor_ti_pos); free(seqlen); esl_msa_Destroy(ka); esl_msa_Destroy(ta); } if(mask != NULL) free(mask); if(dfp != NULL) { fclose(dfp); printf("# Draw file of per-column stats saved to file: %s\n", esl_opt_GetString(go, "--c2dfile")); } if(abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_msafile_Close(tfp); esl_msafile_Close(kfp); return 0; ERROR: return status; }
static int output_result(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores, int *alilens) { ESL_HISTOGRAM *h = NULL; int i; double tailp; double x10; double mu, lambda, E10; double mufix, E10fix; double mufix2, E10fix2; double E10p; double almean, alvar; /* alignment length mean and variance (optional output) */ double pmu, plambda; int status; /* fetch statistical params from HMM for expected distribution */ if (esl_opt_GetBoolean(go, "--vit")) { pmu = hmm->evparam[p7_VMU]; plambda = hmm->evparam[p7_VLAMBDA]; } else if (esl_opt_GetBoolean(go, "--msv")) { pmu = hmm->evparam[p7_MMU]; plambda = hmm->evparam[p7_MLAMBDA]; } else if (esl_opt_GetBoolean(go, "--fwd")) { pmu = hmm->evparam[p7_FTAU]; plambda = hmm->evparam[p7_FLAMBDA]; } /* Optional output of scores/alignment lengths: */ if (cfg->xfp) fwrite(scores, sizeof(double), cfg->N, cfg->xfp); if (cfg->alfp) for (i = 0; i < cfg->N; i++) fprintf(cfg->alfp, "%d %.3f\n", alilens[i], scores[i]); if (esl_opt_GetBoolean(go, "-v")) for (i = 0; i < cfg->N; i++) printf("%.3f\n", scores[i]); /* optional "filter power" data file: <hmm name> <# seqs <= P threshold> <fraction of seqs <= P threshold> */ if (cfg->ffp) output_filter_power(go, cfg, errbuf, hmm, scores); /* Count the scores into a histogram object. */ if ((h = esl_histogram_CreateFull(-50., 50., 0.2)) == NULL) ESL_XFAIL(eslEMEM, errbuf, "allocation failed"); for (i = 0; i < cfg->N; i++) esl_histogram_Add(h, scores[i]); /* For viterbi, MSV, and hybrid, fit data to a Gumbel, either with known lambda or estimated lambda. */ if (esl_opt_GetBoolean(go, "--vit") || esl_opt_GetBoolean(go, "--msv")) { esl_histogram_GetRank(h, 10, &x10); tailp = 1.0; /* mu, lambda, E10 fields are for ML Gumbel fit to the observed data */ if (esl_gumbel_FitComplete(scores, cfg->N, &mu, &lambda) != eslOK) esl_fatal("gumbel complete data fit failed"); E10 = cfg->N * esl_gumbel_surv(x10, mu, lambda); /* mufix, E10fix fields: assume lambda = log2; fit an ML mu to the data */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, 0.693147, &mufix) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for lambda = log2"); E10fix = cfg->N * esl_gumbel_surv(x10, mufix, 0.693147); /* mufix2, E10fix2 fields: assume H3's own lambda estimate; fit ML mu */ if (esl_gumbel_FitCompleteLoc(scores, cfg->N, plambda, &mufix2) != eslOK) esl_fatal("gumbel mu- (location-)only data fit failed for fitted lambda"); E10fix2 = cfg->N * esl_gumbel_surv(x10, mufix2, plambda); /* pmu, plambda, E10p: use H3 expectation estimates (pmu, plambda) */ E10p = cfg->N * esl_gumbel_surv(x10, pmu, plambda); fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, mufix2, E10fix2, pmu, plambda, E10p); if (esl_opt_GetBoolean(go, "-a")) { esl_stats_IMean(alilens, cfg->N, &almean, &alvar); fprintf(cfg->ofp, " %8.4f %8.4f\n", almean, sqrt(alvar)); } else fprintf(cfg->ofp, "\n"); if (cfg->survfp != NULL) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_gumbel_Plot(cfg->survfp, pmu, plambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mu, lambda, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); esl_gumbel_Plot(cfg->survfp, mufix, 0.693147, esl_gumbel_surv, h->xmin - 5., xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_gumbel_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* For Forward, fit tail to exponential tails, for a range of tail mass choices. */ else if (esl_opt_GetBoolean(go, "--fwd")) { double tmin = esl_opt_GetReal(go, "--tmin"); double tmax = esl_opt_GetReal(go, "--tmax"); double tpoints = (double) esl_opt_GetInteger(go, "--tpoints"); int do_linear = esl_opt_GetBoolean(go, "--tlinear"); double *xv; double tau; int n; esl_histogram_GetRank(h, 10, &x10); tailp = tmin; do { if (tailp > 1.0) tailp = 1.0; esl_histogram_GetTailByMass(h, tailp, &xv, &n, NULL); if (esl_exp_FitComplete(xv, n, &mu, &lambda) != eslOK) esl_fatal("exponential fit failed"); E10 = cfg->N * tailp * esl_exp_surv(x10, mu, lambda); mufix = mu; E10fix = cfg->N * tailp * esl_exp_surv(x10, mu, 0.693147); E10p = cfg->N * esl_exp_surv(x10, pmu, plambda); /* the pmu is relative to a P=1.0 tail origin. */ tau = mu + log(tailp) / lambda; fprintf(cfg->ofp, "%-20s %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f\n", hmm->name, tailp, mu, lambda, E10, mufix, E10fix, pmu, plambda, E10p); if (tpoints == 1) break; else if (do_linear) tailp += (tmax-tmin) / (tpoints-1); else tailp *= exp(log(tmax/tmin) / (tpoints-1)); } while (tailp <= tmax+1e-7); if (cfg->survfp) { double xmax = esl_opt_IsOn(go, "--xmax") ? esl_opt_GetReal(go, "--xmax") : h->xmax + 5.; esl_histogram_PlotSurvival(cfg->survfp, h); esl_exp_Plot(cfg->survfp, pmu, plambda, esl_exp_surv, pmu, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, lambda, esl_exp_surv, tau, xmax, 0.1); esl_exp_Plot(cfg->survfp, tau, 0.693147, esl_exp_surv, tau, xmax, 0.1); } if (cfg->efp != NULL) { double x; fprintf(cfg->efp, "# %s\n", hmm->name); for (i = 1; i <= 1000 && i <= cfg->N; i++) { esl_histogram_GetRank(h, i, &x); fprintf(cfg->efp, "%d %g\n", i, cfg->N * esl_exp_surv(x, pmu, plambda)); } fprintf(cfg->efp, "&\n"); } } /* fallthrough: both normal, error cases execute same cleanup code */ status = eslOK; ERROR: esl_histogram_Destroy(h); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); struct cfg_s cfg; /* Set processor specific flags */ impl_Init(); cfg.alifile = NULL; cfg.hmmfile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &cfg.hmmfile, &cfg.alifile); /* Initialize what we can in the config structure (without knowing the alphabet yet) */ cfg.ofp = NULL; /* opened in init_master_cfg() */ cfg.fmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ cfg.afp = NULL; /* created in init_master_cfg() */ cfg.abc = NULL; /* created in init_master_cfg() in masters, or in mpi_worker() in workers */ cfg.hmmfp = NULL; /* opened in init_master_cfg() */ cfg.postmsafile = esl_opt_GetString(go, "-O"); /* NULL by default */ cfg.postmsafp = NULL; /* opened in init_master_cfg() */ cfg.nali = 0; /* this counter is incremented in masters */ cfg.nnamed = 0; /* 0 or 1 if a single MSA; == nali if multiple MSAs */ cfg.do_mpi = FALSE; /* this gets reset below, if we init MPI */ cfg.nproc = 0; /* this gets reset below, if we init MPI */ cfg.my_rank = 0; /* this gets reset below, if we init MPI */ cfg.do_stall = esl_opt_GetBoolean(go, "--stall"); cfg.hmmName = esl_opt_GetString(go, "-n"); /* NULL by default */ if (esl_opt_IsOn(go, "--informat")) { cfg.fmt = esl_msa_EncodeFormat(esl_opt_GetString(go, "--informat")); if (cfg.fmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* This is our stall point, if we need to wait until we get a * debugger attached to this process for debugging (especially * useful for MPI): */ while (cfg.do_stall); /* Start timing. */ esl_stopwatch_Start(w); /* Figure out who we are, and send control there: * we might be an MPI master, an MPI worker, or a serial program. */ #ifdef HAVE_MPI if (esl_opt_GetBoolean(go, "--mpi")) { cfg.do_mpi = TRUE; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &(cfg.my_rank)); MPI_Comm_size(MPI_COMM_WORLD, &(cfg.nproc)); if (cfg.my_rank > 0) mpi_worker(go, &cfg); else mpi_master(go, &cfg); esl_stopwatch_Stop(w); esl_stopwatch_MPIReduce(w, 0, MPI_COMM_WORLD); MPI_Finalize(); } else #endif /*HAVE_MPI*/ { serial_master(go, &cfg); esl_stopwatch_Stop(w); } if (cfg.my_rank == 0) { fputc('\n', cfg.ofp); esl_stopwatch_Display(cfg.ofp, w, "# CPU time: "); } /* Clean up the shared cfg. */ if (cfg.my_rank == 0) { if (esl_opt_IsOn(go, "-o")) { fclose(cfg.ofp); } if (cfg.afp != NULL) esl_msafile_Close(cfg.afp); if (cfg.abc != NULL) esl_alphabet_Destroy(cfg.abc); if (cfg.hmmfp != NULL) fclose(cfg.hmmfp); } esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s")); double mu = esl_opt_GetReal (go, "-m"); double lambda = esl_opt_GetReal (go, "-l"); double tau = esl_opt_GetReal (go, "-t"); int n = esl_opt_GetInteger(go, "-n"); double binwidth = esl_opt_GetReal (go, "-w"); int plot_cdf = esl_opt_GetBoolean(go, "--cdf"); int plot_logcdf = esl_opt_GetBoolean(go, "--logcdf"); int plot_pdf = esl_opt_GetBoolean(go, "--pdf"); int plot_logpdf = esl_opt_GetBoolean(go, "--logpdf"); int plot_surv = esl_opt_GetBoolean(go, "--surv"); int plot_logsurv = esl_opt_GetBoolean(go, "--logsurv"); int be_verbose = esl_opt_GetBoolean(go, "-v"); char *plotfile = esl_opt_GetString (go, "-o"); ESL_HISTOGRAM *h = NULL; int xmin_set = esl_opt_IsOn(go, "--xL"); double xmin = xmin_set ? esl_opt_GetReal(go, "--xL") : mu; int xmax_set = esl_opt_IsOn(go, "--xH"); double xmax = xmax_set ? esl_opt_GetReal(go, "--xH") : mu+40*(1./lambda); int xstep_set = esl_opt_IsOn(go, "--xH"); double xstep = xstep_set ? esl_opt_GetReal(go, "--xS") : 0.1; FILE *pfp = stdout; double emu, elambda, etau; int i; double x; double *data; int ndata; fprintf(stderr, "## %s\n", argv[0]); fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng)); if (be_verbose) printf("Parametric: mu = %f lambda = %f tau = %f\n", mu, lambda, tau); h = esl_histogram_CreateFull(mu, 100., binwidth); if (plotfile && (pfp = fopen(plotfile, "w")) == NULL) ESL_EXCEPTION(eslFAIL, "Failed to open plotfile"); for (i = 0; i < n; i++) { x = esl_wei_Sample(rng, mu, lambda, tau); esl_histogram_Add(h, x); } esl_histogram_GetData(h, &data, &ndata); esl_wei_FitComplete(data, ndata, &emu, &elambda, &etau); if (be_verbose) printf("Complete data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (complete) fitted tau > 10%\n"); esl_wei_FitCompleteBinned(h, &emu, &elambda, &etau); if (be_verbose) printf("Binned data fit: mu = %f lambda = %f tau = %f\n", emu, elambda, etau); if (fabs( (emu-mu)/mu ) > 0.01) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted mu > 1%\n"); if (fabs( (elambda-lambda)/lambda ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted lambda > 10%\n"); if (fabs( (etau-tau)/tau ) > 0.10) ESL_EXCEPTION(eslFAIL, "Error in (binned) fitted lambda > 10%\n"); if (plot_pdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_pdf, xmin, xmax, xstep); if (plot_logpdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logpdf, xmin, xmax, xstep); if (plot_cdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_cdf, xmin, xmax, xstep); if (plot_logcdf) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logcdf, xmin, xmax, xstep); if (plot_surv) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_surv, xmin, xmax, xstep); if (plot_logsurv) esl_wei_Plot(pfp, mu, lambda, tau, &esl_wei_logsurv, xmin, xmax, xstep); if (plotfile) fclose(pfp); esl_histogram_Destroy(h); esl_randomness_Destroy(rng); esl_getopts_Destroy(go); fprintf(stderr, "# status = ok\n"); return 0; }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_alifile, char **ret_postalifile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); if (puts("\nMask range options (format: --xxx 10-20,30-40 ) :") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); if (puts("\nOptions for selecting alphabet rather than guessing it:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); if (puts("\nAlternative model construction strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); if (puts("\nAlternative relative sequence weighting strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); if (puts("\nOther options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) > 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_alifile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <msafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { if ((*ret_postalifile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <postmsafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } } if (strcmp(*ret_alifile, "-") == 0 && ! esl_opt_IsOn(go, "--informat")) { if (puts("Must specify --informat to read <alifile> from stdin ('-')") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); printf("\nTo see more help on other available options, do:\n %s -h\n\n", argv[0]); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA); ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO); ESL_GENCODE *gcode = NULL; ESL_GENCODE_WORKSTATE *wrk = NULL; char *seqfile = NULL; int informat = eslSQFILE_UNKNOWN; ESL_SQFILE *sqfp = NULL; int status; /***************************************************************** * command line parsing *****************************************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, /*docgroup=*/0, /*indent=*/2, /*textwidth=*/80); puts("\nAvailable NCBI genetic code tables (for -c <id>):"); esl_gencode_DumpAltCodeTable(stdout); exit(0); } if (esl_opt_ArgNumber(go) != 1) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } seqfile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--informat") && (informat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); status = esl_sqfile_OpenDigital(nt_abc, seqfile, informat, /*env=*/NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("Failed to find (or open) sequence file %s", seqfile); else if (status == eslEFORMAT) esl_fatal("Failed to recognize format of sequence file %s", seqfile); else if (status != eslOK) esl_fatal("Failure in opening sequence file %s; code %d", seqfile, status); /* A limitation. The esl_sqio_ReadWindow() interface needs to use SSI positioning * to read reverse complement, and that doesn't work on nonrewindable streams. */ if ( esl_opt_GetBoolean(go, "-W") && ! esl_sqfile_IsRewindable(sqfp) && ! esl_opt_GetBoolean(go, "--watson")) esl_fatal("esl-translate can't read reverse complement from a nonrewindable stream (stdin pipe, .gz file, etc)."); /* Set up the genetic code. Default = NCBI 1, the standard code; allow ORFs to start at any aa */ gcode = esl_gencode_Create(nt_abc, aa_abc); esl_gencode_Set(gcode, esl_opt_GetInteger(go, "-c")); // default = 1, the standard genetic code if (esl_opt_GetBoolean(go, "-m")) esl_gencode_SetInitiatorOnlyAUG(gcode); else if (! esl_opt_GetBoolean(go, "-M")) esl_gencode_SetInitiatorAny(gcode); // note this is the default, if neither -m or -M are set /* Set up the workstate structure, which contains both stateful * info about our position in <sqfp> and the DNA <sq>, as well as * one-time config info from options */ wrk = esl_gencode_WorkstateCreate(go, gcode); /* The two styles of main processing loop: */ if (esl_opt_GetBoolean(go, "-W")) do_by_windows(gcode, wrk, sqfp); else do_by_sequences(gcode, wrk, sqfp); esl_gencode_WorkstateDestroy(wrk); esl_sqfile_Close(sqfp); esl_gencode_Destroy(gcode); esl_alphabet_Destroy(aa_abc); esl_alphabet_Destroy(nt_abc); esl_getopts_Destroy(go); return 0; }
/* Function: main() * Synopsis: Run set of queries against an FM * Purpose: Read in a FM and a file of query sequences. * For each query, find matching FM interval, then collect positions in * the original text T for the corresponding occurrences. These positions * are 0-based (so first character is position 0). */ int main(int argc, char *argv[]) { void* tmp; // used for RALLOC calls clock_t t1, t2; struct tms ts1, ts2; char *fname_fm = NULL; char *fname_queries = NULL; FM_HIT *hits = NULL; char *line = NULL; int status = eslOK; int hit_cnt = 0; int hit_indiv_cnt = 0; int miss_cnt = 0; int hit_num = 0; int hit_num2 = 0; int hits_size = 0; int i,j; int count_only = 0; FM_INTERVAL interval; FM_DATA *fmsf = NULL; FM_DATA *fmsb = NULL; FILE* fp_fm = NULL; FILE* fp = NULL; FILE* out = NULL; char *outname = NULL; ESL_GETOPTS *go = NULL; /* command line processing */ FM_CFG *cfg; FM_METADATA *meta; ESL_SQ *tmpseq; // used for sequence validation ESL_ALPHABET *abc = NULL; //start timer t1 = times(&ts1); process_commandline(argc, argv, &go, &fname_fm, &fname_queries); if (esl_opt_IsOn(go, "--out")) { outname = esl_opt_GetString(go, "--out"); if ( esl_strcmp ("-", outname) == 0 ) { out = stdout; outname = "stdout"; } else { out = fopen(outname,"w"); } } if (esl_opt_IsOn(go, "--count_only")) count_only = 1; if((fp_fm = fopen(fname_fm, "rb")) == NULL) esl_fatal("Cannot open file `%s': ", fname_fm); fm_configAlloc(&cfg); cfg->occCallCnt = 0; meta = cfg->meta; meta->fp = fp_fm; fm_readFMmeta( meta); if (meta->alph_type == fm_DNA) abc = esl_alphabet_Create(eslDNA); else if (meta->alph_type == fm_AMINO) abc = esl_alphabet_Create(eslAMINO); tmpseq = esl_sq_CreateDigital(abc); //read in FM-index blocks ESL_ALLOC(fmsf, meta->block_count * sizeof(FM_DATA) ); if (!meta->fwd_only) ESL_ALLOC(fmsb, meta->block_count * sizeof(FM_DATA) ); for (i=0; i<meta->block_count; i++) { fm_FM_read( fmsf+i,meta, TRUE ); if (!meta->fwd_only) { fm_FM_read(fmsb+i, meta, FALSE ); fmsb[i].SA = fmsf[i].SA; fmsb[i].T = fmsf[i].T; } } fclose(fp_fm); output_header(meta, stdout, go, fname_fm, fname_queries); /* initialize a few global variables, then call initGlobals * to do architecture-specific initialization */ fm_configInit(cfg, NULL); fm_alphabetCreate(meta, NULL); // don't override charBits fp = fopen(fname_queries,"r"); if (fp == NULL) esl_fatal("Unable to open file %s\n", fname_queries); ESL_ALLOC(line, FM_MAX_LINE * sizeof(char)); hits_size = 200; ESL_ALLOC(hits, hits_size * sizeof(FM_HIT)); while(fgets(line, FM_MAX_LINE, fp) ) { int qlen=0; while (line[qlen] != '\0' && line[qlen] != '\n') qlen++; if (line[qlen] == '\n') line[qlen] = '\0'; hit_num = 0; for (i=0; i<meta->block_count; i++) { fm_getSARangeReverse(fmsf+i, cfg, line, meta->inv_alph, &interval); if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_forward); } } /* find reverse hits, using backward search on the forward FM*/ if (!meta->fwd_only) { fm_getSARangeForward(fmsb+i, cfg, line, meta->inv_alph, &interval);// yes, use the backward fm to produce the equivalent of a forward search on the forward fm if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } //even though I used fmsb above, use fmsf here, since we'll now do a backward trace //in the FM-index to find the next sampled SA position getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_backward); } } } } if (hit_num > 0) { if (count_only) { hit_cnt++; hit_indiv_cnt += hit_num; } else { hit_num2 = 0; //for each hit, identify the sequence id and position within that sequence for (i = 0; i< hit_num; i++) { status = fm_getOriginalPosition (fmsf, meta, hits[i].block, hits[i].length, fm_forward, hits[i].start, &(hits[i].block), &(hits[i].start) ); hits[i].sortkey = (status==eslERANGE ? -1 : meta->seq_data[ hits[i].block ].target_id); //validate match - if any characters in orig sequence were ambiguities, reject fm_convertRange2DSQ( fmsf, meta, hits[i].start, hits[i].length, p7_NOCOMPLEMENT, tmpseq, TRUE ); for (j=1; j<=hits[i].length; j++) { if (tmpseq->dsq[j] >= abc->K) { hits[i].sortkey = -1; //reject j = hits[i].length+1; //quit looking } } if (hits[i].sortkey != -1) hit_num2++; // legitimate hit } if (hit_num2 > 0) hit_cnt++; //now sort according the the sequence_id corresponding to that seq_offset qsort(hits, hit_num, sizeof(FM_HIT), hit_sorter); //skim past the skipped entries i = 0; while ( i < hit_num ) { if (hits[i].sortkey != -1 ) break; // i++; } if (i < hit_num) { if (out != NULL) { fprintf (out, "%s\n",line); //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); } hit_indiv_cnt++; i++; // skip the first one, since I'll be comparing each to the previous for ( ; i< hit_num; i++) { if ( //meta->seq_data[ hits[i].block ].id != meta->seq_data[ hits[i-1].block ].id || hits[i].sortkey != hits[i-1].sortkey || //sortkey is seq_data[].id hits[i].direction != hits[i-1].direction || hits[i].start != hits[i-1].start ) { if (out != NULL) //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); hit_indiv_cnt++; } } if (out != NULL) fprintf (out, "\n"); } } } else { miss_cnt++; } } for (i=0; i<meta->block_count; i++) { fm_FM_destroy( fmsf+i, 1 ); if (!meta->fwd_only) fm_FM_destroy( fmsb+i, 0 ); } free (hits); free (line); fclose(fp); fm_configDestroy(cfg); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; double throughput = cfg->occCallCnt/elapsedTime; fprintf (stderr, "hit: %-10d (%d)\n", hit_cnt, hit_indiv_cnt); fprintf (stderr, "miss:%-10d\n", miss_cnt); fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); fprintf (stderr, "occ calls: %12s\n", commaprint(cfg->occCallCnt)); fprintf (stderr, "occ/sec: %12s\n", commaprint(throughput)); } exit(eslOK); ERROR: printf ("failure allocating memory for hits\n"); exit(status); }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; int i, j; int status = eslOK; P7_HMMFILE *hfp = NULL; /* open input HMM file */ P7_HMM *hmm = NULL; /* one HMM query */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ P7_BG *bg = NULL; char errbuf[eslERRBUFSIZE]; char* hmmfile; float *rel_ents = NULL; float **heights = NULL; float **probs = NULL; float *ins_P = NULL; float *ins_expL = NULL; float *occupancy = NULL; int mode = HMMLOGO_RELENT_ALL; //default go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal(argv[0], "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal(argv[0], "Error in configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) { p7_banner (stdout, argv[0], banner); esl_usage (stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 100); exit(0); } if (esl_opt_ArgNumber(go) != 1) esl_fatal(argv[0], "Incorrect number of command line arguments.\n"); hmmfile = esl_opt_GetArg(go, 1); if (esl_opt_IsOn(go, "--height_relent_all")) mode = HMMLOGO_RELENT_ALL; else if (esl_opt_IsOn(go, "--height_relent_abovebg")) mode = HMMLOGO_RELENT_ABOVEBG; else if (esl_opt_IsOn(go, "--height_score")) mode = HMMLOGO_SCORE; else mode = HMMLOGO_RELENT_ALL; //default /* Open the query profile HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); status = p7_hmmfile_Read(hfp, &abc, &hmm); bg = p7_bg_Create(abc); ESL_ALLOC(rel_ents, (hmm->M+1) * sizeof(float)); ESL_ALLOC(heights, (hmm->M+1) * sizeof(float*)); ESL_ALLOC(probs, (hmm->M+1) * sizeof(float*)); for (i = 1; i <= hmm->M; i++) { ESL_ALLOC(heights[i], abc->K * sizeof(float)); ESL_ALLOC(probs[i], abc->K * sizeof(float)); } /* residue heights */ if (mode == HMMLOGO_RELENT_ALL) { printf ("max expected height = %.2f\n", hmmlogo_maxHeight(bg) ); hmmlogo_RelativeEntropy_all(hmm, bg, rel_ents, probs, heights); } else if (mode == HMMLOGO_RELENT_ABOVEBG) { printf ("max expected height = %.2f\n", hmmlogo_maxHeight(bg) ); hmmlogo_RelativeEntropy_above_bg(hmm, bg, rel_ents, probs, heights); } else if (mode == HMMLOGO_SCORE) { hmmlogo_ScoreHeights(hmm, bg, heights ); } printf ("Residue heights\n"); for (i = 1; i <= hmm->M; i++) { printf("%d: ", i); for (j=0; j<abc->K; j++) printf("%6.3f ", heights[i][j] ); if (mode != HMMLOGO_SCORE) printf(" (%6.3f)", rel_ents[i]); printf("\n"); } if (rel_ents != NULL) free(rel_ents); if (heights != NULL) { for (i = 1; i <= hmm->M; i++) if (heights[i] != NULL) free(heights[i]); free(heights); } /* indel values */ if (! esl_opt_IsOn(go, "--no_indel")) { ESL_ALLOC(ins_P, (hmm->M+1) * sizeof(float)); ESL_ALLOC(ins_expL, (hmm->M+1) * sizeof(float)); ESL_ALLOC(occupancy, (hmm->M+1) * sizeof(float)); hmmlogo_IndelValues(hmm, ins_P, ins_expL, occupancy); printf ("Indel values\n"); for (i = 1; i <= hmm->M; i++) printf("%d: %6.3f %6.3f %6.3f\n", i, ins_P[i], ins_expL[i], occupancy[i] ); free(ins_P); free(ins_expL); free(occupancy); } p7_hmmfile_Close(hfp); esl_alphabet_Destroy(abc); p7_bg_Destroy(bg); exit(0); ERROR: if (rel_ents != NULL) free(rel_ents); if (heights != NULL) { for (i = 1; i <= hmm->M; i++) if (heights[i] != NULL) free(heights[i]); free(heights); } if (hfp != NULL) p7_hmmfile_Close(hfp); if (abc != NULL) esl_alphabet_Destroy(abc); if (ins_P != NULL) free(ins_P); if (ins_expL != NULL) free(ins_expL); if (occupancy != NULL) free(occupancy); }
/* Function: main() * Synopsis: break input sequence set into chunks, for each one building the * Burrows-Wheeler transform and corresponding FM-index. Maintain requisite * meta data. * Notes: Currently depends on the divsufsort-lite code of Yuta Mori, though this * could easily be replaced. */ int main(int argc, char **argv) { char tmp_filename[16] = "fmtmpXXXXXX"; FILE *fptmp = NULL; FILE *fp = NULL; uint8_t *T = NULL; uint8_t *BWT = NULL; int *SA = NULL; //what I write will be 32-bit ints, but I need to keep this as int so it'll work with libdivsufsort uint32_t *SAsamp = NULL; uint32_t *occCnts_sb = NULL; // same indexing as above uint32_t *cnts_sb = NULL; uint16_t *occCnts_b = NULL; // this is logically a 2D array, but will be indexed as occ_cnts[alph_size*index + char] (instead of occ_cnts[index][char]) uint16_t *cnts_b = NULL; FM_METADATA *meta = NULL; clock_t t1, t2; struct tms ts1, ts2; long i,j,c; int status = eslOK; int chars_per_byte; int num_freq_cnts_sb ; int num_freq_cnts_b ; int num_SA_samples ; int infmt = eslSQFILE_UNKNOWN; int alphatype = eslUNKNOWN; int alphaguess =eslUNKNOWN; ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *tmpsq = NULL; ESL_SQ_BLOCK *block = NULL; char *fname_in = NULL; char *fname_out= NULL; int block_size = 50000000; int sq_cnt = 0; int use_tmpsq = 0; uint64_t block_length; uint64_t total_char_count = 0; int max_block_size; int numblocks = 0; uint32_t numseqs = 0; int allocedseqs = 1000; uint32_t seq_offset = 0; uint32_t ambig_offset = 0; uint32_t overlap = 0; uint16_t seq_cnt; uint16_t ambig_cnt; uint32_t prev_numseqs = 0; int compressed_bytes; uint32_t term_loc; ESL_GETOPTS *go = NULL; /* command line processing */ uint8_t ambig_repl = 0; int in_ambig_run = 0; FM_AMBIGLIST ambig_list; ESL_ALLOC (meta, sizeof(FM_METADATA)); if (meta == NULL) esl_fatal("unable to allocate memory to store FM meta data\n"); ESL_ALLOC (meta->ambig_list, sizeof(FM_AMBIGLIST)); if (meta->ambig_list == NULL) esl_fatal("unable to allocate memory to store FM ambiguity data\n"); fm_initAmbiguityList(meta->ambig_list); meta->alph_type = fm_DNA; meta->freq_SA = 8; meta->freq_cnt_b = 256; meta->freq_cnt_sb = pow(2,16); //65536 - that's the # values in a short meta->seq_count = 0; ESL_ALLOC (meta->seq_data, allocedseqs * sizeof(FM_SEQDATA)); if (meta->seq_data == NULL ) esl_fatal("unable to allocate memory to store FM sequence data\n"); process_commandline(argc, argv, &go, &fname_in, &fname_out); if (esl_opt_IsOn(go, "--bin_length")) meta->freq_cnt_b = esl_opt_GetInteger(go, "--bin_length"); if ( meta->freq_cnt_b < 32 || meta->freq_cnt_b >4096 || (meta->freq_cnt_b & (meta->freq_cnt_b - 1)) ) // test power of 2 esl_fatal("bin_length must be a power of 2, at least 128, and at most 4096\n"); if (esl_opt_IsOn(go, "--sa_freq")) meta->freq_SA = esl_opt_GetInteger(go, "--sa_freq"); if ( (meta->freq_SA & (meta->freq_SA - 1)) ) // test power of 2 esl_fatal ("SA_freq must be a power of 2\n"); if (esl_opt_IsOn(go, "--block_size")) block_size = 1000000 * esl_opt_GetInteger(go, "--block_size"); if ( block_size <=0 ) esl_fatal ("block_size must be a positive number\n"); //start timer t1 = times(&ts1); output_header(stdout, go, fname_in, fname_out); if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(fname_in, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", fname_in); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", fname_in); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); meta->fwd_only = 0; if (esl_opt_IsUsed(go, "--alph")) { meta->alph = esl_opt_GetString(go, "--alph") ; if ( esl_strcmp(meta->alph, "dna")==0 || esl_strcmp(meta->alph, "rna")==0) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "dna_full")==0 || esl_strcmp(meta->alph, "rna_full")==0) { meta->alph_type = fm_DNA_full; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "amino")==0) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } else { esl_sqfile_GuessAlphabet(sqfp, &alphaguess); if (alphaguess == eslDNA || alphaguess == eslRNA) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (alphaguess == eslAMINO) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } if (esl_opt_IsOn(go, "--fwd_only") ) meta->fwd_only = 1; meta->alph = NULL; //getInverseAlphabet fm_alphabetCreate(meta, &(meta->charBits)); chars_per_byte = 8/meta->charBits; //shift inv_alph up one, to make space for '$' at 0 for (i=0; i<256; i++) if ( meta->inv_alph[i] >= 0) meta->inv_alph[i]++; abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); tmpsq = esl_sq_CreateDigital(abc); esl_sqfile_SetDigital(sqfp, abc); block = esl_sq_CreateDigitalBlock(FM_BLOCK_COUNT, abc); block->complete = FALSE; // max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size*.2; // +1 for the '$' max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size; // temporary hack to avoid memory over-runs (see end of 1101_fmindex_benchmarking/00NOTES) if (alphatype == fm_DNA) fm_initAmbiguityList(&ambig_list); /* Allocate BWT, Text, SA, and FM-index data structures, allowing storage of maximally large sequence*/ ESL_ALLOC (T, max_block_size * sizeof(uint8_t)); ESL_ALLOC (BWT, max_block_size * sizeof(uint8_t)); ESL_ALLOC (SA, max_block_size * sizeof(int)); ESL_ALLOC (SAsamp, (1+floor((double)max_block_size/meta->freq_SA) ) * sizeof(uint32_t)); ESL_ALLOC (occCnts_sb, (1+ceil((double)max_block_size/meta->freq_cnt_sb)) * meta->alph_size * sizeof(uint32_t)); // every freq_cnt_sb positions, store an array of ints ESL_ALLOC (cnts_sb, meta->alph_size * sizeof(uint32_t)); ESL_ALLOC (occCnts_b, ( 1+ceil((double)max_block_size/meta->freq_cnt_b)) * meta->alph_size * sizeof(uint16_t)); // every freq_cnt_b positions, store an array of 8-byte ints ESL_ALLOC (cnts_b, meta->alph_size * sizeof(uint16_t)); if((T == NULL) || (BWT == NULL) || (SA==NULL) || (SAsamp==NULL) || (BWT==NULL) || (cnts_b==NULL) || (occCnts_b==NULL) || (cnts_sb==NULL) || (occCnts_sb==NULL) ) { esl_fatal( "%s: Cannot allocate memory.\n", argv[0]); } // Open a temporary file, to which FM-index data will be written if (esl_tmpfile(tmp_filename, &fptmp) != eslOK) esl_fatal("unable to open fm-index tmpfile"); /* Main loop: */ while (status == eslOK ) { //reset block as an empty vessel for (i=0; i<block->count; i++) esl_sq_Reuse(block->list + i); if (use_tmpsq) { esl_sq_Copy(tmpsq , block->list); block->complete = FALSE; //this lets ReadBlock know that it needs to append to a small bit of previously-read seqeunce block->list->C = FM_BLOCK_OVERLAP; // overload the ->C value, which ReadBlock uses to determine how much // overlap should be retained in the ReadWindow step } else { block->complete = TRUE; } status = esl_sqio_ReadBlock(sqfp, block, block_size, -1, alphatype != eslAMINO); if (status == eslEOF) continue; if (status != eslOK) ESL_XEXCEPTION(status, "failure reading sequence block"); seq_offset = numseqs; ambig_offset = meta->ambig_list->count; if (block->complete || block->count == 0) { use_tmpsq = FALSE; } else { /* The final sequence on the block was a probably-incomplete window of the active sequence. * Grab a copy of the end for use in the next pass, to ensure we don't miss hits crossing * the boundary between two blocks. */ esl_sq_Copy(block->list + (block->count - 1) , tmpsq); use_tmpsq = TRUE; } block->first_seqidx = sq_cnt; sq_cnt += block->count - (use_tmpsq ? 1 : 0);// if there's an incomplete sequence read into the block wait to count it until it's complete. /* Read dseqs from block into text element T. * Convert the dsq from esl-alphabet to fm-alphabet (1..k for alphabet of size k). * (a) collapsing upper/lower case for appropriate sorting. * (b) reserving 0 for '$', which must be lexicographically smallest * (these will later be shifted to 0-based alphabet, once SA has been built) * */ block_length = 0; for (i=0; i<block->count; i++) { //start a new block, with space for the name allocateSeqdata(meta, block->list+i, numseqs, &allocedseqs); //meta data meta->seq_data[numseqs].target_id = block->first_seqidx + i ; meta->seq_data[numseqs].target_start = block->list[i].start; meta->seq_data[numseqs].fm_start = block_length; if (block->list[i].name == NULL) meta->seq_data[numseqs].name[0] = '\0'; else strcpy(meta->seq_data[numseqs].name, block->list[i].name ); if (block->list[i].acc == NULL) meta->seq_data[numseqs].acc[0] = '\0'; else strcpy(meta->seq_data[numseqs].acc, block->list[i].acc ); if (block->list[i].source == NULL) meta->seq_data[numseqs].source[0] = '\0'; else strcpy(meta->seq_data[numseqs].source, block->list[i].source ); if (block->list[i].desc == NULL) meta->seq_data[numseqs].desc[0] = '\0'; else strcpy(meta->seq_data[numseqs].desc, block->list[i].desc ); for (j=1; j<=block->list[i].n; j++) { c = abc->sym[block->list[i].dsq[j]]; if ( meta->alph_type == fm_DNA) { if (meta->inv_alph[c] == -1) { // replace ambiguity characters by rotating through A,C,G, and T. c = meta->alph[ambig_repl]; ambig_repl = (ambig_repl+1)%4; if (!in_ambig_run) { fm_addAmbiguityRange(meta->ambig_list, block_length, block_length); in_ambig_run=1; } else { meta->ambig_list->ranges[meta->ambig_list->count - 1].upper = block_length; } } else { in_ambig_run=0; } } else if (meta->inv_alph[c] == -1) { esl_fatal("requested alphabet doesn't match input text\n"); } T[block_length] = meta->inv_alph[c]; block_length++; if (j>block->list[i].C) total_char_count++; // add to total count, only if it's not redundant with earlier read meta->seq_data[numseqs].length++; } numseqs++; } T[block_length] = 0; // last character 0 is effectively '$' for suffix array block_length++; seq_cnt = numseqs-seq_offset; ambig_cnt = meta->ambig_list->count - ambig_offset; //build and write FM-index for T. This will be a BWT on the reverse of the sequence, required for reverse-traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, (uint32_t)block->list[0].C, T, BWT, SA, SAsamp, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); if ( ! meta->fwd_only ) { //build and write FM-index for un-reversed T (used to find reverse hits using forward traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, 0, T, BWT, SA, NULL, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); } prev_numseqs = numseqs; numblocks++; } esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); esl_sq_Destroy(tmpsq); esl_sq_DestroyBlock(block); meta->seq_count = numseqs; meta->block_count = numblocks; /* Finished writing the FM-index data to a temporary file. Now write * metadata to fname_out, than append FM-index data from temp file */ if((fp = fopen(fname_out, "wb")) == NULL) esl_fatal( "%s: Cannot open file `%s': ", argv[0], fname_out); //write out meta data if( fwrite(&(meta->fwd_only), sizeof(meta->fwd_only), 1, fp) != 1 || fwrite(&(meta->alph_type), sizeof(meta->alph_type), 1, fp) != 1 || fwrite(&(meta->alph_size), sizeof(meta->alph_size), 1, fp) != 1 || fwrite(&(meta->charBits), sizeof(meta->charBits), 1, fp) != 1 || fwrite(&(meta->freq_SA), sizeof(meta->freq_SA), 1, fp) != 1 || fwrite(&(meta->freq_cnt_sb), sizeof(meta->freq_cnt_sb), 1, fp) != 1 || fwrite(&(meta->freq_cnt_b), sizeof(meta->freq_cnt_b), 1, fp) != 1 || fwrite(&(meta->block_count), sizeof(meta->block_count), 1, fp) != 1 || fwrite(&(meta->seq_count), sizeof(meta->seq_count), 1, fp) != 1 || fwrite(&(meta->ambig_list->count), sizeof(meta->ambig_list->count), 1, fp) != 1 || fwrite(&total_char_count, sizeof(total_char_count), 1, fp) != 1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); for (i=0; i<meta->seq_count; i++) { if( fwrite(&(meta->seq_data[i].target_id), sizeof(meta->seq_data[i].target_id), 1, fp) != 1 || fwrite(&(meta->seq_data[i].target_start), sizeof(meta->seq_data[i].target_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].fm_start), sizeof(meta->seq_data[i].fm_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].length), sizeof(meta->seq_data[i].length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].name_length), sizeof(meta->seq_data[i].name_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].acc_length), sizeof(meta->seq_data[i].acc_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].source_length),sizeof(meta->seq_data[i].source_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].desc_length), sizeof(meta->seq_data[i].desc_length), 1, fp) != 1 || fwrite(meta->seq_data[i].name, sizeof(char), meta->seq_data[i].name_length+1 , fp) != meta->seq_data[i].name_length+1 || fwrite(meta->seq_data[i].acc, sizeof(char), meta->seq_data[i].acc_length+1 , fp) != meta->seq_data[i].acc_length+1 || fwrite(meta->seq_data[i].source, sizeof(char), meta->seq_data[i].source_length+1, fp) != meta->seq_data[i].source_length+1 || fwrite(meta->seq_data[i].desc, sizeof(char), meta->seq_data[i].desc_length+1 , fp) != meta->seq_data[i].desc_length+1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); } for (i=0; i<meta->ambig_list->count; i++) { if( fwrite(&(meta->ambig_list->ranges[i].lower), sizeof(meta->ambig_list->ranges[i].lower), 1, fp) != 1 || fwrite(&(meta->ambig_list->ranges[i].upper), sizeof(meta->ambig_list->ranges[i].upper), 1, fp) != 1 ) esl_fatal( "%s: Error writing ambiguity data for FM index.\n", argv[0]); } /* now append the FM-index data in fptmp to the desired output file, fp */ rewind(fptmp); for (i=0; i<numblocks; i++) { for(j=0; j< (meta->fwd_only?1:2); j++ ) { //do this once or twice, once for forward-T index, and possibly once for reversed //first, read if(fread(&block_length, sizeof(block_length), 1, fptmp) != 1) esl_fatal( "%s: Error reading block_length in FM index.\n", argv[0]); if(fread(&term_loc, sizeof(term_loc), 1, fptmp) != 1) esl_fatal( "%s: Error reading terminal location in FM index.\n", argv[0]); if(fread(&seq_offset, sizeof(seq_offset), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_offset in FM index.\n", argv[0]); if(fread(&ambig_offset, sizeof(ambig_offset ), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_offset in FM index.\n", argv[0]); if(fread(&overlap, sizeof(overlap), 1, fptmp) != 1) esl_fatal( "%s: Error reading overlap in FM index.\n", argv[0]); if(fread(&seq_cnt, sizeof(seq_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_cnt in FM index.\n", argv[0]); if(fread(&ambig_cnt, sizeof(ambig_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_cnt in FM index.\n", argv[0]); compressed_bytes = ((chars_per_byte-1+block_length)/chars_per_byte); num_freq_cnts_b = 1+ceil((double)block_length/meta->freq_cnt_b); num_freq_cnts_sb = 1+ceil((double)block_length/meta->freq_cnt_sb); num_SA_samples = 1+floor((double)block_length/meta->freq_SA); //j==0 test cause T and SA to be written only for forward sequence if(j==0 && fread(T, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading T in FM index.\n", argv[0]); if(fread(BWT, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading BWT in FM index.\n", argv[0]); if(j==0 && fread(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fptmp) != (size_t)num_SA_samples) esl_fatal( "%s: Error reading SA in FM index.\n", argv[0]); if(fread(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fptmp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error reading occCnts_b in FM index.\n", argv[0]); if(fread(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fptmp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error reading occCnts_sb in FM index.\n", argv[0]); //then, write if(fwrite(&block_length, sizeof(block_length), 1, fp) != 1) esl_fatal( "%s: Error writing block_length in FM index.\n", argv[0]); if(fwrite(&term_loc, sizeof(term_loc), 1, fp) != 1) esl_fatal( "%s: Error writing terminal location in FM index.\n", argv[0]); if(fwrite(&seq_offset, sizeof(seq_offset), 1, fp) != 1) esl_fatal( "%s: Error writing seq_offset in FM index.\n", argv[0]); if(fwrite(&ambig_offset, sizeof(ambig_offset), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_offset in FM index.\n", argv[0]); if(fwrite(&overlap, sizeof(overlap), 1, fp) != 1) esl_fatal( "%s: Error writing overlap in FM index.\n", argv[0]); if(fwrite(&seq_cnt, sizeof(seq_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing seq_cnt in FM index.\n", argv[0]); if(fwrite(&ambig_cnt, sizeof(ambig_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_cnt in FM index.\n", argv[0]); if(j==0 && fwrite(T, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing T in FM index.\n", argv[0]); if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing BWT in FM index.\n", argv[0]); if(j==0 && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples) esl_fatal( "%s: Error writing SA in FM index.\n", argv[0]); if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error writing occCnts_b in FM index.\n", argv[0]); if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error writing occCnts_sb in FM index.\n", argv[0]); } } fprintf (stderr, "Number of characters in index: %ld\n", (long)total_char_count); fprintf (stderr, "Number of FM-index blocks: %ld\n", (long)meta->block_count); fclose(fp); fclose(fptmp); free(T); free(BWT); free(SA); free(SAsamp); free(occCnts_b); free(cnts_b); free(occCnts_sb); free(cnts_sb); fm_metaDestroy(meta); esl_getopts_Destroy(go); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); } return (eslOK); ERROR: /* Deallocate memory. */ if (fp) fclose(fp); if (T) free(T); if (BWT) free(BWT); if (SA) free(SA); if (SAsamp) free(SAsamp); if (occCnts_b) free(occCnts_b); if (cnts_b) free(cnts_b); if (occCnts_sb) free(occCnts_sb); if (cnts_sb) free(cnts_sb); if (ambig_list.ranges) free(ambig_list.ranges); fm_metaDestroy(meta); esl_getopts_Destroy(go); esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); if (tmpsq) esl_sq_Destroy(tmpsq); if (block) esl_sq_DestroyBlock(block); fprintf (stderr, "failure during memory allocation\n"); exit(status); }