static int output_header(FM_METADATA *meta, FILE *ofp, const ESL_GETOPTS *go, char *fmfile, char *qfile) { char *alph; char *appname = NULL; int status; if (meta->alph_type == fm_DNA) alph = "dna"; else if (meta->alph_type == fm_AMINO) alph = "amino"; if ((status = esl_FileTail(go->argv[0], FALSE, &appname)) != eslOK) return status; if (fprintf(ofp, "# %s :: %s\n", appname, banner) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# %s\n", EASEL_COPYRIGHT) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# %s\n", EASEL_LICENSE) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# input binary-formatted HMMER database: %s\n", fmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# input file of query sequences: %s\n", qfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--out")) { char *outfile = esl_opt_GetString(go, "--out"); if (fprintf(ofp, "# output file containing list of hits: %s\n", (esl_strcmp(outfile, "-") == 0 ? "stdout" : outfile)) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--count_only") && fprintf(ofp, "# output only counts, not hit locations\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# alphabet : %s\n", alph) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# bin_length : %d\n", meta->freq_cnt_b) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# suffix array sample rate: %d\n", meta->freq_SA) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (appname) free(appname); return eslOK; ERROR: if (appname) free(appname); return status; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; struct cfg_s cfg; int status = eslOK; impl_Init(); /* processor specific initialization */ p7_FLogsumInit(); /* we're going to use table-driven Logsum() approximations at times */ /* Initialize what we can in the config structure (without knowing the alphabet yet) */ cfg.hmmfile = NULL; cfg.dbfile = NULL; cfg.do_mpi = FALSE; /* this gets reset below, if we init MPI */ cfg.nproc = 0; /* this gets reset below, if we init MPI */ cfg.my_rank = 0; /* this gets reset below, if we init MPI */ cfg.firstseq_key = NULL; cfg.n_targetseq = -1; process_commandline(argc, argv, &go, &cfg.hmmfile, &cfg.dbfile); /* is the range restricted? */ if (esl_opt_IsUsed(go, "--restrictdb_stkey") ) if ((cfg.firstseq_key = esl_opt_GetString(go, "--restrictdb_stkey")) == NULL) p7_Fail("Failure capturing --restrictdb_stkey\n"); if (esl_opt_IsUsed(go, "--restrictdb_n") ) cfg.n_targetseq = esl_opt_GetInteger(go, "--restrictdb_n"); if ( cfg.n_targetseq != -1 && cfg.n_targetseq < 1 ) p7_Fail("--restrictdb_n must be >= 1\n"); { status = serial_master(go, &cfg); } esl_getopts_Destroy(go); return status; }
static int output_header(FILE *ofp, const ESL_GETOPTS *go) { p7_banner(ofp, go->argv[0], banner); if (esl_opt_IsUsed(go, "--eval2score")) { if ( fprintf(ofp, "# show score required to reach E-value: %.2g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--score2eval")) { if ( fprintf(ofp, "# show E-value corresponding to score: %.2g\n", esl_opt_GetReal(go, "-S")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--eval2score") || esl_opt_IsUsed(go, "--score2eval")) { if (esl_opt_IsUsed(go, "--baseZ") ) { if ( fprintf(ofp, "# using base count (search both strands): %d Mb\n", esl_opt_GetInteger(go, "--baseZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { if ( fprintf(ofp, "# using base count (search single strand): %d Mb\n", esl_opt_GetInteger(go, "--baseZ1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else { if ( fprintf(ofp, "# using database sequence count: %d\n", esl_opt_GetInteger(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } } if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
static void process_SearchCmd(HMMD_COMMAND *cmd, WORKER_ENV *env) { int i; int cnt; int limit; int status; int blk_size; WORKER_INFO *info = NULL; ESL_ALPHABET *abc; ESL_STOPWATCH *w; ESL_THREADS *threadObj = NULL; pthread_mutex_t inx_mutex; int current_index; QUEUE_DATA *query = NULL; time_t date; char timestamp[32]; w = esl_stopwatch_Create(); abc = esl_alphabet_Create(eslAMINO); if (pthread_mutex_init(&inx_mutex, NULL) != 0) p7_Fail("mutex init failed"); ESL_ALLOC(info, sizeof(*info) * env->ncpus); /* Log the current time (at search start) */ date = time(NULL); ctime_r(&date, timestamp); printf("\n%s", timestamp); /* note that ctime_r() leaves \n on end of timestamp */ /* initialize thread data */ query = process_QueryCmd(cmd, env); esl_stopwatch_Start(w); info->range_list = NULL; if (esl_opt_IsUsed(query->opts, "--seqdb_ranges")) { ESL_ALLOC(info->range_list, sizeof(RANGE_LIST)); hmmpgmd_GetRanges(info->range_list, esl_opt_GetString(query->opts, "--seqdb_ranges")); } if (query->cmd_type == HMMD_CMD_SEARCH) threadObj = esl_threads_Create(&search_thread); else threadObj = esl_threads_Create(&scan_thread); if (query->query_type == HMMD_SEQUENCE) { fprintf(stdout, "Search seq %s [L=%ld]", query->seq->name, (long) query->seq->n); } else { fprintf(stdout, "Search hmm %s [M=%d]", query->hmm->name, query->hmm->M); } fprintf(stdout, " vs %s DB %d [%d - %d]", (query->cmd_type == HMMD_CMD_SEARCH) ? "SEQ" : "HMM", query->dbx, query->inx, query->inx + query->cnt - 1); if (info->range_list) fprintf(stdout, " in range(s) %s", esl_opt_GetString(query->opts, "--seqdb_ranges")); fprintf(stdout, "\n"); /* Create processing pipeline and hit list */ for (i = 0; i < env->ncpus; ++i) { info[i].abc = query->abc; info[i].hmm = query->hmm; info[i].seq = query->seq; info[i].opts = query->opts; info[i].range_list = info[0].range_list; info[i].th = NULL; info[i].pli = NULL; info[i].inx_mutex = &inx_mutex; info[i].inx = ¤t_index;/* this is confusing trickery - to share a single variable across all threads */ info[i].blk_size = &blk_size; /* ditto */ info[i].limit = &limit; /* ditto. TODO: come back and clean this up. */ if (query->cmd_type == HMMD_CMD_SEARCH) { HMMER_SEQ **list = env->seq_db->db[query->dbx].list; info[i].sq_list = &list[query->inx]; info[i].sq_cnt = query->cnt; info[i].db_Z = env->seq_db->db[query->dbx].K; info[i].om_list = NULL; info[i].om_cnt = 0; } else { info[i].sq_list = NULL; info[i].sq_cnt = 0; info[i].db_Z = 0; info[i].om_list = &env->hmm_db->list[query->inx]; info[i].om_cnt = query->cnt; } esl_threads_AddThread(threadObj, &info[i]); } /* try block size of 5000. we will need enough sequences for four * blocks per thread or better. */ blk_size = 5000; cnt = query->cnt / env->ncpus / blk_size; limit = query->cnt * 2 / 3; if (cnt < 4) { /* try block size of 1000 */ blk_size /= 5; cnt = query->cnt / env->ncpus / blk_size; if (cnt < 4) { /* still not enough. just divide it up into one block per thread */ blk_size = query->cnt / env->ncpus + 1; limit = query->cnt * 2; } } current_index = 0; esl_threads_WaitForStart(threadObj); esl_threads_WaitForFinish(threadObj); esl_stopwatch_Stop(w); #if 1 fprintf (stdout, " Sequences Residues Elapsed\n"); for (i = 0; i < env->ncpus; ++i) { print_timings(i, info[i].elapsed, info[i].pli); } #endif /* merge the results of the search results */ for (i = 1; i < env->ncpus; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } print_timings(99, w->elapsed, info[0].pli); send_results(env->fd, w, info); /* free the last of the pipeline data */ p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); free_QueueData(query); esl_threads_Destroy(threadObj); pthread_mutex_destroy(&inx_mutex); if (info->range_list) { if (info->range_list->starts) free(info->range_list->starts); if (info->range_list->ends) free(info->range_list->ends); free (info->range_list); } free(info); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); return; ERROR: LOG_FATAL_MSG("malloc", errno); }
static int output_header(const ESL_GETOPTS *go, const struct cfg_s *cfg) { if (cfg->my_rank > 0) return eslOK; /* if (! cfg->be_verbose) return eslOK; */ p7_banner(cfg->ofp, go->argv[0], banner); fprintf(cfg->ofp, "# input alignment file: %s\n", cfg->alifile); fprintf(cfg->ofp, "# output HMM file: %s\n", cfg->hmmfile); if (esl_opt_IsUsed(go, "-n")) fprintf(cfg->ofp, "# name (the single) HMM: %s\n", esl_opt_GetString(go, "-n")); if (esl_opt_IsUsed(go, "-o")) fprintf(cfg->ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")); if (esl_opt_IsUsed(go, "-O")) fprintf(cfg->ofp, "# processed alignment resaved to: %s\n", esl_opt_GetString(go, "-O")); if (esl_opt_IsUsed(go, "--amino")) fprintf(cfg->ofp, "# input alignment is asserted as: protein\n"); if (esl_opt_IsUsed(go, "--dna")) fprintf(cfg->ofp, "# input alignment is asserted as: DNA\n"); if (esl_opt_IsUsed(go, "--rna")) fprintf(cfg->ofp, "# input alignment is asserted as: RNA\n"); if (esl_opt_IsUsed(go, "--fast")) fprintf(cfg->ofp, "# model architecture construction: fast/heuristic\n"); if (esl_opt_IsUsed(go, "--hand")) fprintf(cfg->ofp, "# model architecture construction: hand-specified by RF annotation\n"); if (esl_opt_IsUsed(go, "--symfrac")) fprintf(cfg->ofp, "# sym fraction for model structure: %.3f\n", esl_opt_GetReal(go, "--symfrac")); if (esl_opt_IsUsed(go, "--fragthresh"))fprintf(cfg->ofp, "# seq called fragment if < xL : %.3f\n", esl_opt_GetReal(go, "--fragthresh")); if (esl_opt_IsUsed(go, "--wpb")) fprintf(cfg->ofp, "# relative weighting scheme: Henikoff PB\n"); if (esl_opt_IsUsed(go, "--wgsc")) fprintf(cfg->ofp, "# relative weighting scheme: G/S/C\n"); if (esl_opt_IsUsed(go, "--wblosum")) fprintf(cfg->ofp, "# relative weighting scheme: BLOSUM filter\n"); if (esl_opt_IsUsed(go, "--wnone")) fprintf(cfg->ofp, "# relative weighting scheme: none\n"); if (esl_opt_IsUsed(go, "--wid")) fprintf(cfg->ofp, "# frac id cutoff for BLOSUM wgts: %f\n", esl_opt_GetReal(go, "--wid")); if (esl_opt_IsUsed(go, "--eent")) fprintf(cfg->ofp, "# effective seq number scheme: entropy weighting\n"); if (esl_opt_IsUsed(go, "--eclust")) fprintf(cfg->ofp, "# effective seq number scheme: single linkage clusters\n"); if (esl_opt_IsUsed(go, "--enone")) fprintf(cfg->ofp, "# effective seq number scheme: none\n"); if (esl_opt_IsUsed(go, "--eset")) fprintf(cfg->ofp, "# effective seq number: set to %f\n", esl_opt_GetReal(go, "--eset")); if (esl_opt_IsUsed(go, "--ere") ) fprintf(cfg->ofp, "# minimum rel entropy target: %f bits\n", esl_opt_GetReal(go, "--ere")); if (esl_opt_IsUsed(go, "--esigma") ) fprintf(cfg->ofp, "# entropy target sigma parameter: %f bits\n", esl_opt_GetReal(go, "--esigma")); if (esl_opt_IsUsed(go, "--eid") ) fprintf(cfg->ofp, "# frac id cutoff for --eclust: %f\n", esl_opt_GetReal(go, "--eid")); if (esl_opt_IsUsed(go, "--EmL") ) fprintf(cfg->ofp, "# seq length for MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmL")); if (esl_opt_IsUsed(go, "--EmN") ) fprintf(cfg->ofp, "# seq number for MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmN")); if (esl_opt_IsUsed(go, "--EvL") ) fprintf(cfg->ofp, "# seq length for Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvL")); if (esl_opt_IsUsed(go, "--EvN") ) fprintf(cfg->ofp, "# seq number for Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvN")); if (esl_opt_IsUsed(go, "--EfL") ) fprintf(cfg->ofp, "# seq length for Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfL")); if (esl_opt_IsUsed(go, "--EfN") ) fprintf(cfg->ofp, "# seq number for Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfN")); if (esl_opt_IsUsed(go, "--Eft") ) fprintf(cfg->ofp, "# tail mass for Fwd exp tau fit: %f\n", esl_opt_GetReal(go, "--Eft")); #ifdef HMMER_THREADS if (esl_opt_IsUsed(go, "--cpu")) fprintf(cfg->ofp, "# number of worker threads: %d\n", esl_opt_GetInteger(go, "--cpu")); #endif #ifdef HAVE_MPI if (esl_opt_IsUsed(go, "--mpi") ) fprintf(cfg->ofp, "# parallelization mode: MPI\n"); #endif if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0) fprintf(cfg->ofp,"# random number seed: one-time arbitrary\n"); else fprintf(cfg->ofp,"# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")); } if (esl_opt_IsUsed(go, "--laplace") ) fprintf(cfg->ofp, "# prior: Laplace +1\n"); fprintf(cfg->ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); return eslOK; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; float nseq; int do_eval2score = 0; int do_score2eval = 0; int z_val; float e_val; float s_val; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read <hmmfile> argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } output_header(stdout, go); if ( esl_opt_IsOn(go, "--eval2score") ) { do_eval2score = TRUE; e_val = esl_opt_GetReal(go, "-E"); } else if ( esl_opt_IsOn(go, "--score2eval") ) { do_score2eval = TRUE; s_val = esl_opt_GetReal(go, "-S"); } else if ( esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1") || esl_opt_IsUsed(go, "-Z") ) { puts("The flags -Z, --baseZ, and --baseZ1 are for use with --eval2score and --score2eval."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_IsUsed(go, "--baseZ") ) { z_val = 1000000 * 2 * (long)(esl_opt_GetInteger(go, "--baseZ")); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { z_val = 1000000 * (long)(esl_opt_GetInteger(go, "--baseZ1")); } else { z_val = esl_opt_GetInteger(go, "-Z"); } /* Initializations: open the HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); if (do_eval2score) printf (" %6s %6.2g", "sc for", e_val); if (do_score2eval) printf (" %6s %6.2f", "E-val for", s_val); printf("\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); if (do_eval2score) printf (" %13s", "-------------"); if (do_score2eval) printf (" %13s", "-------------"); printf("\n"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if ( esl_opt_IsOn(go, "--eval2score") || esl_opt_IsOn(go, "--score2eval") ) { if (esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1" ) ) { if ( hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --baseZ and --baseZ1 can't be used with non-nucleotide models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } else if ( hmm->abc->type != eslAMINO && hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --eval2score and --score2eval can't be used with non-sequence models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } if (esl_opt_IsUsed(go, "--baseZ") ) { nseq = (float)z_val / (float)(hmm->max_length); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { nseq = (float)z_val / (float)(hmm->max_length); } else { nseq = z_val; } if (bg == NULL) bg = p7_bg_Create(abc); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); if ( do_eval2score ) { float sc; sc = esl_exp_invsurv( e_val / nseq , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2f", sc); } else if ( do_score2eval) { float e; e = nseq * esl_exp_surv( s_val , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2g", e); } printf("\n"); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmdb> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* output file for results (default stdout) */ FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */ FILE *dfamtblfp = NULL; /* output stream for tabular Dfam format (--dfamtblout) */ FILE *aliscoresfp = NULL; /* output stream for alignment scores (--aliscoresout) */ // P7_HMM *hmm = NULL; /* one HMM query */ // P7_SCOREDATA *scoredata = NULL; int seqfmt = eslSQFILE_UNKNOWN; /* format of seqfile */ ESL_SQFILE *sqfp = NULL; /* open seqfile */ P7_HMMFILE *hfp = NULL; /* open HMM database file */ ESL_ALPHABET *abc = NULL; /* sequence alphabet */ P7_OPROFILE *om = NULL; /* target profile */ ESL_STOPWATCH *w = NULL; /* timing */ ESL_SQ *qsq = NULL; /* query sequence */ int nquery = 0; int textw; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; #ifdef HMMER_THREADS P7_OM_BLOCK *block = NULL; ESL_THREADS *threadObj= NULL; ESL_WORK_QUEUE *queue = NULL; #endif char errbuf[eslERRBUFSIZE]; double window_beta = -1.0 ; int window_length = -1; if (esl_opt_IsUsed(go, "--w_beta")) { if ( ( window_beta = esl_opt_GetReal(go, "--w_beta") ) < 0 || window_beta > 1 ) esl_fatal("Invalid window-length beta value\n"); } if (esl_opt_IsUsed(go, "--w_length")) { if (( window_length = esl_opt_GetInteger(go, "--w_length")) < 4 ) esl_fatal("Invalid window length value\n"); } w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); /* If caller declared an input format, decode it */ if (esl_opt_IsOn(go, "--qformat")) { seqfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat")); if (seqfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat")); } /* validate options if running as a daemon */ // if (esl_opt_IsOn(go, "--daemon")) { /* running as a daemon, the input format must be type daemon */ // if (seqfmt != eslSQFILE_UNKNOWN && seqfmt != eslSQFILE_DAEMON) // esl_fatal("Input format %s not supported. Must be daemon\n", esl_opt_GetString(go, "--qformat")); // seqfmt = eslSQFILE_DAEMON; // if (strcmp(cfg->seqfile, "-") != 0) esl_fatal("Query sequence file must be '-'\n"); // } /* Open the target profile database to get the sequence alphabet */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem, trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); if (! hfp->is_pressed) p7_Fail("Failed to open binary auxfiles for %s: use hmmpress first\n", hfp->fname); hstatus = p7_oprofile_ReadMSV(hfp, &abc, &om); if (hstatus == eslEFORMAT) p7_Fail("bad format, binary auxfiles, %s:\n%s", cfg->hmmfile, hfp->errbuf); else if (hstatus == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); else if (hstatus != eslOK) p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); p7_oprofile_Destroy(om); p7_hmmfile_Close(hfp); /* Open the query sequence database */ status = esl_sqfile_OpenDigital(abc, cfg->seqfile, seqfmt, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->seqfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->seqfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->seqfile); if (sqfp->format > 100) // breaking the law! That range is reserved for msa, for aligned formats p7_Fail("%s contains a multiple sequence alignment; expect unaligned sequences, like FASTA\n", cfg->seqfile); qsq = esl_sq_CreateDigital(abc); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) esl_fatal("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); } if (esl_opt_IsOn(go, "--dfamtblout")) { if ((dfamtblfp = fopen(esl_opt_GetString(go, "--dfamtblout"),"w")) == NULL) esl_fatal("Failed to open tabular dfam output file %s for writing\n", esl_opt_GetString(go, "--dfamtblout")); } if (esl_opt_IsOn(go, "--aliscoresout")) { if ((aliscoresfp = fopen(esl_opt_GetString(go, "--aliscoresout"),"w")) == NULL) esl_fatal("Failed to open alignment scores output file %s for writing\n", esl_opt_GetString(go, "--aliscoresout")); } output_header(ofp, go, cfg->hmmfile, cfg->seqfile); #ifdef HMMER_THREADS /* initialize thread data */ if (esl_opt_IsOn(go, "--cpu")) ncpus = esl_opt_GetInteger(go, "--cpu"); else esl_threads_CPUCount(&ncpus); if (ncpus > 0) { threadObj = esl_threads_Create(&pipeline_thread); queue = esl_workqueue_Create(ncpus * 2); } #endif infocnt = (ncpus == 0) ? 1 : ncpus; ESL_ALLOC(info, sizeof(*info) * infocnt); for (i = 0; i < infocnt; ++i) { info[i].bg = p7_bg_Create(abc); #ifdef HMMER_THREADS info[i].queue = queue; #endif } #ifdef HMMER_THREADS for (i = 0; i < ncpus * 2; ++i) { block = p7_oprofile_CreateBlock(BLOCK_SIZE); if (block == NULL) esl_fatal("Failed to allocate sequence block"); status = esl_workqueue_Init(queue, block); if (status != eslOK) esl_fatal("Failed to add block to work queue"); } #endif /* Outside loop: over each query sequence in <seqfile>. */ while ((sstatus = esl_sqio_Read(sqfp, qsq)) == eslOK) { if (sstatus == eslEMEM) p7_Fail("Memory allocation error reading sequence file\n", status); if (sstatus == eslEINCONCEIVABLE) p7_Fail("Unexpected error %d reading sequence file\n", status); // if (qsq->L > NHMMER_MAX_RESIDUE_COUNT) p7_Fail("Input sequence %s in file %s exceeds maximum length of %d bases.\n", qsq->name, cfg->seqfile, NHMMER_MAX_RESIDUE_COUNT); nquery++; esl_stopwatch_Start(w); /* Open the target profile database */ status = p7_hmmfile_OpenE(cfg->hmmfile, p7_HMMDBENV, &hfp, NULL); if (status != eslOK) p7_Fail("Unexpected error %d in opening hmm file %s.\n", status, cfg->hmmfile); #ifdef HMMER_THREADS /* if we are threaded, create a lock to prevent multiple readers */ if (ncpus > 0) { status = p7_hmmfile_CreateLock(hfp); if (status != eslOK) p7_Fail("Unexpected error %d creating lock\n", status); } #endif if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->acc[0] != 0 && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (qsq->desc[0] != 0 && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].pli = p7_pipeline_Create(go, 100, 100, TRUE, p7_SCAN_MODELS); /* M_hint = 100, L_hint = 100 are just dummies for now */ info[i].pli->hfp = hfp; /* for two-stage input, pipeline needs <hfp> */ p7_pli_NewSeq(info[i].pli, qsq); info[i].qsq = qsq; if ( esl_opt_IsUsed(go, "--toponly") ) info[i].pli->strand = p7_STRAND_TOPONLY; else if ( esl_opt_IsUsed(go, "--bottomonly") ) info[i].pli->strand = p7_STRAND_BOTTOMONLY; else info[i].pli->strand = p7_STRAND_BOTH; #ifdef HMMER_THREADS if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]); #endif } #ifdef HMMER_THREADS if (ncpus > 0) hstatus = thread_loop(threadObj, queue, hfp); else hstatus = serial_loop(info, hfp); #else hstatus = serial_loop(info, hfp); #endif switch(hstatus) { case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing */ break; default: p7_Fail("Unexpected error in reading HMMs from %s", cfg->hmmfile); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); } /* modify e-value to account for number of models */ for (i = 0; i < info->th->N ; i++) { info->th->unsrt[i].lnP += log((float)info->pli->nmodels); info->th->unsrt[i].dcl[0].lnP = info->th->unsrt[i].lnP; info->th->unsrt[i].sortkey = -1.0 * info->th->unsrt[i].lnP; } /* it's possible to have duplicates based on how viterbi ranges can overlap */ p7_tophits_SortByModelnameAndAlipos(info->th); p7_tophits_RemoveDuplicates(info->th, info->pli->use_bit_cutoffs); /* Print results */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); //tally up total number of hits and target coverage info->pli->n_output = info->pli->pos_output = 0; for (i = 0; i < info->th->N; i++) { if ( (info->th->hit[i]->flags & p7_IS_REPORTED) || info->th->hit[i]->flags & p7_IS_INCLUDED) { info->pli->n_output++; info->pli->pos_output += abs(info->th->hit[i]->dcl[0].jali - info->th->hit[i]->dcl[0].iali) + 1; } } p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1)); if (dfamtblfp) p7_tophits_TabularXfam(dfamtblfp, qsq->name, NULL, info->th, info->pli); if (aliscoresfp) p7_tophits_AliScores(aliscoresfp, qsq->name, info->th ); esl_stopwatch_Stop(w); info->pli->nseqs = 1; p7_pli_Statistics(ofp, info->pli, w); if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); fflush(ofp); p7_hmmfile_Close(hfp); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); esl_sq_Reuse(qsq); } if (sstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp)); else if (sstatus != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", sstatus, sqfp->filename); /* Terminate outputs - any last words? */ if (tblfp) p7_tophits_TabularTail(tblfp, "hmmscan", p7_SCAN_MODELS, cfg->seqfile, cfg->hmmfile, go); if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Cleanup - prepare for successful exit */ for (i = 0; i < infocnt; ++i) p7_bg_Destroy(info[i].bg); #ifdef HMMER_THREADS if (ncpus > 0) { esl_workqueue_Reset(queue); while (esl_workqueue_Remove(queue, (void **) &block) == eslOK) p7_oprofile_DestroyBlock(block); esl_workqueue_Destroy(queue); esl_threads_Destroy(threadObj); } #endif free(info); esl_sq_Destroy(qsq); esl_stopwatch_Destroy(w); esl_alphabet_Destroy(abc); esl_sqfile_Close(sqfp); if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return eslOK; ERROR: if (ofp != stdout) fclose(ofp); if (tblfp) fclose(tblfp); if (dfamtblfp) fclose(dfamtblfp); if (aliscoresfp) fclose(aliscoresfp); return status; }
static int output_header(FILE *ofp, ESL_GETOPTS *go, char *hmmfile, char *seqfile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# query sequence file: %s\n", seqfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# target HMM database: %s\n", hmmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--tblout") && fprintf(ofp, "# per-seq hits tabular output: %s\n", esl_opt_GetString(go, "--tblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--dfamtblout") && fprintf(ofp, "# hits output in Dfam format: %s\n", esl_opt_GetString(go, "--dfamtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--aliscoresout") && fprintf(ofp, "# alignment scores output: %s\n", esl_opt_GetString(go, "--aliscoresout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--acc") && fprintf(ofp, "# prefer accessions over names: yes\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-E") && fprintf(ofp, "# profile reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-T") && fprintf(ofp, "# profile reporting threshold: score >= %g\n", esl_opt_GetReal(go, "-T")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incE") && fprintf(ofp, "# profile inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incT") && fprintf(ofp, "# profile inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_ga") && fprintf(ofp, "# model-specific thresholding: GA cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_nc") && fprintf(ofp, "# model-specific thresholding: NC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_tc") && fprintf(ofp, "# model-specific thresholding: TC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--max") && fprintf(ofp, "# Max sensitivity mode: on [all heuristic filters off]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F1") && fprintf(ofp, "# MSV filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F2") && fprintf(ofp, "# Vit filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F3") && fprintf(ofp, "# Fwd filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nobias") && fprintf(ofp, "# biased composition HMM filter: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B1") && fprintf(ofp, "# biased comp MSV window len: %d\n", esl_opt_GetInteger(go, "--B1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B2") && fprintf(ofp, "# biased comp Viterbi window len: %d\n", esl_opt_GetInteger(go, "--B2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--B3") && fprintf(ofp, "# biased comp Forward window len: %d\n", esl_opt_GetInteger(go, "--B3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nonull2") && fprintf(ofp, "# null2 bias corrections: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--toponly") && fprintf(ofp, "# search only top strand: on\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--bottomonly") && fprintf(ofp, "# search only bottom strand: on\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-Z") && fprintf(ofp, "# sequence search space set to: %.0f\n", esl_opt_GetReal(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed")==0 && fprintf(ofp, "# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp, "# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--qformat") && fprintf(ofp, "# input seqfile format asserted: %s\n", esl_opt_GetString(go, "--qformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--w_beta") && fprintf(ofp, "# window length beta value: %g\n", esl_opt_GetReal(go, "--w_beta")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--w_length") && fprintf(ofp, "# window length : %d\n", esl_opt_GetInteger(go, "--w_length")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // if (esl_opt_IsUsed(go, "--daemon") && fprintf(ofp, "run as a daemon process\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); #ifdef HMMER_THREADS if (esl_opt_IsUsed(go, "--cpu") && fprintf(ofp, "# number of worker threads: %d\n", esl_opt_GetInteger(go, "--cpu")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); #endif if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_alifile, char **ret_postalifile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line:\n%s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); if (puts("\nMask range options (format: --xxx 10-20,30-40 ) :") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 5, 2, 80); if (puts("\nOptions for selecting alphabet rather than guessing it:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); if (puts("\nAlternative model construction strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 3, 2, 80); if (puts("\nAlternative relative sequence weighting strategies:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 4, 2, 80); if (puts("\nOther options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 8, 2, 80); exit(0); } if (esl_opt_ArgNumber(go) > 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_alifile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <msafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { if ((*ret_postalifile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <postmsafile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } } if (strcmp(*ret_alifile, "-") == 0 && ! esl_opt_IsOn(go, "--informat")) { if (puts("Must specify --informat to read <alifile> from stdin ('-')") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); printf("\nTo see more help on other available options, do:\n %s -h\n\n", argv[0]); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
int main(int argc, char **argv) { int i,j; ESL_GETOPTS *go = NULL; /* command line processing */ ESL_STOPWATCH *w = esl_stopwatch_Create(); int status; ESL_MSA *msa = NULL; FILE *ofp = NULL; /* output file (default is stdout) */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ char *alifile; /* name of the alignment file we're building HMMs from */ ESLX_MSAFILE *afp = NULL; /* open alifile */ int infmt = eslMSAFILE_UNKNOWN; /* autodetect alignment format by default. */ int outfmt = eslMSAFILE_STOCKHOLM; char *postmsafile; /* optional file to resave annotated, modified MSAs to */ FILE *postmsafp = NULL; /* open <postmsafile>, or NULL */ int mask_range_cnt = 0; uint32_t mask_starts[100]; // over-the-top allocation. uint32_t mask_ends[100]; char *rangestr; char *range; int *map = NULL; /* map[i]=j, means model position i comes from column j of the alignment; 1..alen */ int keep_mm; /* Set processor specific flags */ impl_Init(); alifile = NULL; postmsafile = NULL; /* Parse the command line */ process_commandline(argc, argv, &go, &alifile, &postmsafile); keep_mm = esl_opt_IsUsed(go, "--apendmask"); /* Initialize what we can in the config structure (without knowing the alphabet yet). * Fields controlled by masters are set up in usual_master() or mpi_master() * Fields used by workers are set up in mpi_worker() */ ofp = NULL; infmt = eslMSAFILE_UNKNOWN; afp = NULL; abc = NULL; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat")); } /* Determine output alignment file format */ outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat")); /* Parse the ranges */ if (esl_opt_IsUsed(go, "--alirange")) { esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--modelrange")) { esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--model2ali")) { esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ; } else if (esl_opt_IsUsed(go, "--ali2model")) { esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ; } else { if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR; } while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) { status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt ); if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range); if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range); mask_range_cnt++; } /* Start timing. */ esl_stopwatch_Start(w); /* Open files, set alphabet. * afp - open alignment file for input * abc - alphabet expected or guessed in ali file * postmsafp - open MSA output file * ofp - optional open output file, or stdout */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else abc = NULL; status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { postmsafp = fopen(postmsafile, "w"); if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile); } if (esl_opt_IsUsed(go, "-o")) { ofp = fopen(esl_opt_GetString(go, "-o"), "w"); if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; /* Looks like the i/o is set up successfully... * Initial output to the user */ output_header(go, ofp, alifile, postmsafile); /* cheery output header */ /* read the alignment */ if ((status = eslx_msafile_Read(afp, &msa)) != eslOK) eslx_msafile_ReadFailure(afp, status); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { /* add/modify mmline for the mask */ if (msa->mm == NULL) { ESL_ALLOC(msa->mm, msa->alen); keep_mm = FALSE; } if (!keep_mm) for (i=0; i<msa->alen; i++) msa->mm[i] = '.'; } // convert model coordinates to alignment coordinates, if necessary if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) { float symfrac = esl_opt_GetReal(go, "--symfrac"); int do_hand = esl_opt_IsOn(go, "--hand"); int L; //same as p7_builder relative_weights if (esl_opt_IsOn(go, "--wnone") ) { esl_vec_DSet(msa->wgt, msa->nseq, 1.); } else if (esl_opt_IsOn(go, "--wgiven") ) ; else if (esl_opt_IsOn(go, "--wpb") ) status = esl_msaweight_PB(msa); else if (esl_opt_IsOn(go, "--wgsc") ) status = esl_msaweight_GSC(msa); else if (esl_opt_IsOn(go, "--wblosum")) status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid")); if ((status = esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh"))) != eslOK) goto ERROR; //build a map of model mask coordinates to alignment coords ESL_ALLOC(map, sizeof(int) * (msa->alen+1)); L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map ); if ( esl_opt_IsUsed(go, "--model2ali") ) { //print mapping printf ("model coordinates alignment coordinates\n"); for (i=0; i<mask_range_cnt; i++) printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]); /* If I wanted to, I could print all the map values independently: printf("\n\n-----------\n"); printf("Map\n"); printf("---\n"); for (i=0; i<L; i++) printf("%d -> %d\n", i+1, map[i]); */ } else if ( esl_opt_IsUsed(go, "--ali2model") ) { //print mapping (requires scanning the inverted map int alistart = 0; int aliend = 0; printf ("alignment coordinates model coordinates\n"); for (i=0; i<mask_range_cnt; i++) { //find j for ali positions while (map[alistart] < mask_starts[i] ) alistart++; aliend = alistart; while (map[aliend] < mask_ends[i] ) aliend++; printf (" %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1); } } else { //convert the mask coords based on map for (i=0; i<mask_range_cnt; i++) { mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment mask_ends[i] = map[mask_ends[i]-1]; } } } if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { //overwrite '.' with 'm' everywhere the range says to do it for (i=0; i<mask_range_cnt; i++) for (j=mask_starts[i]; j<=mask_ends[i]; j++) msa->mm[j-1] = 'm'; if ((status = eslx_msafile_Write(postmsafp, msa, outfmt)) != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); } esl_stopwatch_Stop(w); if (esl_opt_IsOn(go, "-o")) fclose(ofp); if (postmsafp) fclose(postmsafp); if (afp) eslx_msafile_Close(afp); if (abc) esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); esl_stopwatch_Destroy(w); return 0; ERROR: return eslFAIL; }
static int output_header(const ESL_GETOPTS *go, FILE *ofp, char *alifile, char *postmsafile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# input alignment file: %s\n", alifile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) { if (fprintf(ofp, "# output alignment file: %s\n", postmsafile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--alirange") && fprintf(ofp, "# alignment range: %s\n", esl_opt_GetString(go, "--alirange")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--modelrange") && fprintf(ofp, "# model range: %s\n", esl_opt_GetString(go, "--modelrange"))< 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--apendmask") && fprintf(ofp, "# add to existing mask: [on]\n" )< 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--model2ali") && fprintf(ofp, "# ali ranges for model range: %s\n", esl_opt_GetString(go, "--model2ali")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--ali2model") && fprintf(ofp, "# model ranges for ali range: %s\n", esl_opt_GetString(go, "--ali2model")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--amino") && fprintf(ofp, "# input alignment is asserted as: protein\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--dna") && fprintf(ofp, "# input alignment is asserted as: DNA\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--rna") && fprintf(ofp, "# input alignment is asserted as: RNA\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--fast") && fprintf(ofp, "# model architecture construction: fast/heuristic\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--hand") && fprintf(ofp, "# model architecture construction: hand-specified by RF annotation\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--symfrac") && fprintf(ofp, "# sym fraction for model structure: %.3f\n", esl_opt_GetReal(go, "--symfrac")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--fragthresh") && fprintf(ofp, "# seq called frag if L <= x*alen: %.3f\n", esl_opt_GetReal(go, "--fragthresh")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wpb") && fprintf(ofp, "# relative weighting scheme: Henikoff PB\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wgsc") && fprintf(ofp, "# relative weighting scheme: G/S/C\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wblosum") && fprintf(ofp, "# relative weighting scheme: BLOSUM filter\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wnone") && fprintf(ofp, "# relative weighting scheme: none\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--wid") && fprintf(ofp, "# frac id cutoff for BLOSUM wgts: %f\n", esl_opt_GetReal(go, "--wid")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0 && fprintf(ofp,"# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp,"# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
/* serial_master() * The serial version of hmmsearch. * For each query HMM in <hmmfile> search the database for hits. * * A master can only return if it's successful. All errors are handled * immediately and fatally with p7_Fail(). We also use the * ESL_EXCEPTION and ERROR: mechanisms, but only because we know we're * using a fatal exception handler. */ static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg) { FILE *ofp = stdout; /* results output file (-o) */ P7_HMMFILE *hfp = NULL; /* open input HMM file */ ESL_SQFILE *dbfp = NULL; /* open input sequence file */ P7_HMM *hmm = NULL; /* one HMM query */ ESL_ALPHABET *abc = NULL; /* digital alphabet */ int dbfmt = eslSQFILE_UNKNOWN; /* format code for sequence database file */ ESL_STOPWATCH *w; int textw = 0; int nquery = 0; int status = eslOK; int hstatus = eslOK; int sstatus = eslOK; int i; int ncpus = 0; int infocnt = 0; WORKER_INFO *info = NULL; char errbuf[eslERRBUFSIZE]; w = esl_stopwatch_Create(); if (esl_opt_GetBoolean(go, "--notextw")) textw = 0; else textw = esl_opt_GetInteger(go, "--textw"); if (esl_opt_IsOn(go, "--tformat")) { dbfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat")); if (dbfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat")); } /* Open the target sequence database */ status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp); if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->dbfile); else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->dbfile); else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile"); else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile); if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) { if (esl_opt_IsUsed(go, "--ssifile")) esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile")); else esl_sqfile_OpenSSI(dbfp, NULL); } /* Open the query profile HMM file */ status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, cfg->hmmfile, errbuf); /* Open the results output files */ if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w+")) == NULL) p7_Fail("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); } infocnt = 1; ESL_ALLOC(info, sizeof(*info) * infocnt); /* <abc> is not known 'til first HMM is read. */ hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); if (hstatus == eslOK) { /* One-time initializations after alphabet <abc> becomes known */ // output_header(ofp, go, cfg->hmmfile, cfg->dbfile); // dbfp->abc = abc; //ReadBlock requires knowledge of the alphabet to decide how best to read blocks // for (i = 0; i < infocnt; ++i) // { // info[i].bg = p7_bg_Create(abc); // } } /* Outer loop: over each query HMM in <hmmfile>. */ // while (hstatus == eslOK) // { P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; /* optimized query profile */ nquery++; esl_stopwatch_Start(w); /* seqfile may need to be rewound (multiquery mode) */ if (nquery > 1) { if (! esl_sqfile_IsRewindable(dbfp)) esl_fatal("Target sequence file %s isn't rewindable; can't search it with multiple queries", cfg->dbfile); if (! esl_opt_IsUsed(go, "--restrictdb_stkey") ) esl_sqfile_Position(dbfp, 0); //only re-set current position to 0 if we're not planning to set it in a moment } if ( cfg->firstseq_key != NULL ) { //it's tempting to want to do this once and capture the offset position for future passes, but ncbi files make this non-trivial, so this keeps it general sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key); if (sstatus != eslOK) p7_Fail("Failure setting restrictdb_stkey to %d\n", cfg->firstseq_key); } // if (fprintf(ofp, "Query: %s [M=%d]\n", hmm->name, hmm->M) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // if (hmm->acc) { if (fprintf(ofp, "Accession: %s\n", hmm->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } /* Convert to an optimized model */ gm = p7_profile_Create (hmm->M, abc); om = p7_oprofile_Create(hmm->M, abc); // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */ p7_oprofile_Convert(gm, om); /* <om> is now p7_LOCAL, multihit */ for (i = 0; i < infocnt; ++i) { /* Create processing pipeline and hit list */ info[i].th = p7_tophits_Create(); info[i].om = p7_oprofile_Clone(om); info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */ P7_PIPELINE *pli = info[i].pli; pli->nmodels++; pli->nnodes += info[i].om->M; // if (pli->Z_setby == p7_ZSETBY_NTARGETS && pli->mode == p7_SCAN_MODELS) pli->Z = pli->nmodels; // if (pli->do_biasfilter) p7_bg_SetFilter(info[i].bg, info[i].om->M, info[i].om->compo); // if (pli->mode == p7_SEARCH_SEQS) // status = p7_pli_NewModelThresholds(pli, info[i].om); pli->W = info[i].om->max_length; } sstatus = serial_loop(info, dbfp, cfg->n_targetseq, ofp); switch(sstatus) { case eslEFORMAT: esl_fatal("Parse failed (sequence file %s):\n%s\n", dbfp->filename, esl_sqfile_GetErrorBuf(dbfp)); break; case eslEOF: /* do nothing */ break; default: esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename); } /* merge the results of the search results */ for (i = 1; i < infocnt; ++i) { p7_tophits_Merge(info[0].th, info[i].th); p7_pipeline_Merge(info[0].pli, info[i].pli); p7_pipeline_Destroy(info[i].pli); p7_tophits_Destroy(info[i].th); p7_oprofile_Destroy(info[i].om); } /* Print the results. */ p7_tophits_SortBySortkey(info->th); p7_tophits_Threshold(info->th, info->pli); // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); esl_stopwatch_Stop(w); // p7_pli_Statistics(ofp, info->pli, w); // if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); p7_pipeline_Destroy(info->pli); p7_tophits_Destroy(info->th); p7_oprofile_Destroy(info->om); p7_oprofile_Destroy(om); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); // hstatus = p7_hmmfile_Read(hfp, &abc, &hmm); // } /* end outer loop over query HMMs */ switch(hstatus) { case eslEOD: p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile); break; case eslEFORMAT: p7_Fail("bad file format in HMM file %s", cfg->hmmfile); break; case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets", cfg->hmmfile); break; case eslEOF: case eslOK: /* do nothing. EOF is what we want. */ break; default: p7_Fail("Unexpected error (%d) in reading HMMs from %s", hstatus, cfg->hmmfile); } /* Terminate outputs... any last words? */ /* Cleanup - prepare for exit */ // for (i = 0; i < infocnt; ++i) // p7_bg_Destroy(info[i].bg); free(info); p7_hmmfile_Close(hfp); esl_sqfile_Close(dbfp); esl_alphabet_Destroy(abc); esl_stopwatch_Destroy(w); if (ofp != stdout) fclose(ofp); printf("44HHHH \n"); return eslOK; ERROR: return eslFAIL; }
static int output_header(FILE *ofp, const ESL_GETOPTS *go, char *hmmfile, char *seqfile) { p7_banner(ofp, go->argv[0], banner); if (fprintf(ofp, "# query HMM file: %s\n", hmmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# target sequence database: %s\n", seqfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-A") && fprintf(ofp, "# MSA of all hits saved to file: %s\n", esl_opt_GetString(go, "-A")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--tblout") && fprintf(ofp, "# per-seq hits tabular output: %s\n", esl_opt_GetString(go, "--tblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domtblout") && fprintf(ofp, "# per-dom hits tabular output: %s\n", esl_opt_GetString(go, "--domtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--pfamtblout") && fprintf(ofp, "# pfam-style tabular hit output: %s\n", esl_opt_GetString(go, "--pfamtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--acc") && fprintf(ofp, "# prefer accessions over names: yes\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-E") && fprintf(ofp, "# sequence reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-T") && fprintf(ofp, "# sequence reporting threshold: score >= %g\n", esl_opt_GetReal(go, "-T")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domE") && fprintf(ofp, "# domain reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "--domE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domT") && fprintf(ofp, "# domain reporting threshold: score >= %g\n", esl_opt_GetReal(go, "--domT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incE") && fprintf(ofp, "# sequence inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incT") && fprintf(ofp, "# sequence inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incdomE") && fprintf(ofp, "# domain inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incdomE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--incdomT") && fprintf(ofp, "# domain inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incdomT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_ga") && fprintf(ofp, "# model-specific thresholding: GA cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_nc") && fprintf(ofp, "# model-specific thresholding: NC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--cut_tc") && fprintf(ofp, "# model-specific thresholding: TC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--max") && fprintf(ofp, "# Max sensitivity mode: on [all heuristic filters off]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F1") && fprintf(ofp, "# MSV filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F2") && fprintf(ofp, "# Vit filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--F3") && fprintf(ofp, "# Fwd filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nobias") && fprintf(ofp, "# biased composition HMM filter: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--restrictdb_stkey") && fprintf(ofp, "# Restrict db to start at seq key: %s\n", esl_opt_GetString(go, "--restrictdb_stkey")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--restrictdb_n") && fprintf(ofp, "# Restrict db to # target seqs: %d\n", esl_opt_GetInteger(go, "--restrictdb_n")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--ssifile") && fprintf(ofp, "# Override ssi file to: %s\n", esl_opt_GetString(go, "--ssifile")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--nonull2") && fprintf(ofp, "# null2 bias corrections: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "-Z") && fprintf(ofp, "# sequence search space set to: %.0f\n", esl_opt_GetReal(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--domZ") && fprintf(ofp, "# domain search space set to: %.0f\n", esl_opt_GetReal(go, "--domZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--seed")) { if (esl_opt_GetInteger(go, "--seed") == 0 && fprintf(ofp, "# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); else if ( fprintf(ofp, "# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--tformat") && fprintf(ofp, "# targ <seqfile> format asserted: %s\n", esl_opt_GetString(go, "--tformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; }
/* Function: main() * Synopsis: break input sequence set into chunks, for each one building the * Burrows-Wheeler transform and corresponding FM-index. Maintain requisite * meta data. * Notes: Currently depends on the divsufsort-lite code of Yuta Mori, though this * could easily be replaced. */ int main(int argc, char **argv) { char tmp_filename[16] = "fmtmpXXXXXX"; FILE *fptmp = NULL; FILE *fp = NULL; uint8_t *T = NULL; uint8_t *BWT = NULL; int *SA = NULL; //what I write will be 32-bit ints, but I need to keep this as int so it'll work with libdivsufsort uint32_t *SAsamp = NULL; uint32_t *occCnts_sb = NULL; // same indexing as above uint32_t *cnts_sb = NULL; uint16_t *occCnts_b = NULL; // this is logically a 2D array, but will be indexed as occ_cnts[alph_size*index + char] (instead of occ_cnts[index][char]) uint16_t *cnts_b = NULL; FM_METADATA *meta = NULL; clock_t t1, t2; struct tms ts1, ts2; long i,j,c; int status = eslOK; int chars_per_byte; int num_freq_cnts_sb ; int num_freq_cnts_b ; int num_SA_samples ; int infmt = eslSQFILE_UNKNOWN; int alphatype = eslUNKNOWN; int alphaguess =eslUNKNOWN; ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *tmpsq = NULL; ESL_SQ_BLOCK *block = NULL; char *fname_in = NULL; char *fname_out= NULL; int block_size = 50000000; int sq_cnt = 0; int use_tmpsq = 0; uint64_t block_length; uint64_t total_char_count = 0; int max_block_size; int numblocks = 0; uint32_t numseqs = 0; int allocedseqs = 1000; uint32_t seq_offset = 0; uint32_t ambig_offset = 0; uint32_t overlap = 0; uint16_t seq_cnt; uint16_t ambig_cnt; uint32_t prev_numseqs = 0; int compressed_bytes; uint32_t term_loc; ESL_GETOPTS *go = NULL; /* command line processing */ uint8_t ambig_repl = 0; int in_ambig_run = 0; FM_AMBIGLIST ambig_list; ESL_ALLOC (meta, sizeof(FM_METADATA)); if (meta == NULL) esl_fatal("unable to allocate memory to store FM meta data\n"); ESL_ALLOC (meta->ambig_list, sizeof(FM_AMBIGLIST)); if (meta->ambig_list == NULL) esl_fatal("unable to allocate memory to store FM ambiguity data\n"); fm_initAmbiguityList(meta->ambig_list); meta->alph_type = fm_DNA; meta->freq_SA = 8; meta->freq_cnt_b = 256; meta->freq_cnt_sb = pow(2,16); //65536 - that's the # values in a short meta->seq_count = 0; ESL_ALLOC (meta->seq_data, allocedseqs * sizeof(FM_SEQDATA)); if (meta->seq_data == NULL ) esl_fatal("unable to allocate memory to store FM sequence data\n"); process_commandline(argc, argv, &go, &fname_in, &fname_out); if (esl_opt_IsOn(go, "--bin_length")) meta->freq_cnt_b = esl_opt_GetInteger(go, "--bin_length"); if ( meta->freq_cnt_b < 32 || meta->freq_cnt_b >4096 || (meta->freq_cnt_b & (meta->freq_cnt_b - 1)) ) // test power of 2 esl_fatal("bin_length must be a power of 2, at least 128, and at most 4096\n"); if (esl_opt_IsOn(go, "--sa_freq")) meta->freq_SA = esl_opt_GetInteger(go, "--sa_freq"); if ( (meta->freq_SA & (meta->freq_SA - 1)) ) // test power of 2 esl_fatal ("SA_freq must be a power of 2\n"); if (esl_opt_IsOn(go, "--block_size")) block_size = 1000000 * esl_opt_GetInteger(go, "--block_size"); if ( block_size <=0 ) esl_fatal ("block_size must be a positive number\n"); //start timer t1 = times(&ts1); output_header(stdout, go, fname_in, fname_out); if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(fname_in, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", fname_in); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", fname_in); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); meta->fwd_only = 0; if (esl_opt_IsUsed(go, "--alph")) { meta->alph = esl_opt_GetString(go, "--alph") ; if ( esl_strcmp(meta->alph, "dna")==0 || esl_strcmp(meta->alph, "rna")==0) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "dna_full")==0 || esl_strcmp(meta->alph, "rna_full")==0) { meta->alph_type = fm_DNA_full; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "amino")==0) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } else { esl_sqfile_GuessAlphabet(sqfp, &alphaguess); if (alphaguess == eslDNA || alphaguess == eslRNA) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (alphaguess == eslAMINO) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } if (esl_opt_IsOn(go, "--fwd_only") ) meta->fwd_only = 1; meta->alph = NULL; //getInverseAlphabet fm_alphabetCreate(meta, &(meta->charBits)); chars_per_byte = 8/meta->charBits; //shift inv_alph up one, to make space for '$' at 0 for (i=0; i<256; i++) if ( meta->inv_alph[i] >= 0) meta->inv_alph[i]++; abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); tmpsq = esl_sq_CreateDigital(abc); esl_sqfile_SetDigital(sqfp, abc); block = esl_sq_CreateDigitalBlock(FM_BLOCK_COUNT, abc); block->complete = FALSE; // max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size*.2; // +1 for the '$' max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size; // temporary hack to avoid memory over-runs (see end of 1101_fmindex_benchmarking/00NOTES) if (alphatype == fm_DNA) fm_initAmbiguityList(&ambig_list); /* Allocate BWT, Text, SA, and FM-index data structures, allowing storage of maximally large sequence*/ ESL_ALLOC (T, max_block_size * sizeof(uint8_t)); ESL_ALLOC (BWT, max_block_size * sizeof(uint8_t)); ESL_ALLOC (SA, max_block_size * sizeof(int)); ESL_ALLOC (SAsamp, (1+floor((double)max_block_size/meta->freq_SA) ) * sizeof(uint32_t)); ESL_ALLOC (occCnts_sb, (1+ceil((double)max_block_size/meta->freq_cnt_sb)) * meta->alph_size * sizeof(uint32_t)); // every freq_cnt_sb positions, store an array of ints ESL_ALLOC (cnts_sb, meta->alph_size * sizeof(uint32_t)); ESL_ALLOC (occCnts_b, ( 1+ceil((double)max_block_size/meta->freq_cnt_b)) * meta->alph_size * sizeof(uint16_t)); // every freq_cnt_b positions, store an array of 8-byte ints ESL_ALLOC (cnts_b, meta->alph_size * sizeof(uint16_t)); if((T == NULL) || (BWT == NULL) || (SA==NULL) || (SAsamp==NULL) || (BWT==NULL) || (cnts_b==NULL) || (occCnts_b==NULL) || (cnts_sb==NULL) || (occCnts_sb==NULL) ) { esl_fatal( "%s: Cannot allocate memory.\n", argv[0]); } // Open a temporary file, to which FM-index data will be written if (esl_tmpfile(tmp_filename, &fptmp) != eslOK) esl_fatal("unable to open fm-index tmpfile"); /* Main loop: */ while (status == eslOK ) { //reset block as an empty vessel for (i=0; i<block->count; i++) esl_sq_Reuse(block->list + i); if (use_tmpsq) { esl_sq_Copy(tmpsq , block->list); block->complete = FALSE; //this lets ReadBlock know that it needs to append to a small bit of previously-read seqeunce block->list->C = FM_BLOCK_OVERLAP; // overload the ->C value, which ReadBlock uses to determine how much // overlap should be retained in the ReadWindow step } else { block->complete = TRUE; } status = esl_sqio_ReadBlock(sqfp, block, block_size, -1, alphatype != eslAMINO); if (status == eslEOF) continue; if (status != eslOK) ESL_XEXCEPTION(status, "failure reading sequence block"); seq_offset = numseqs; ambig_offset = meta->ambig_list->count; if (block->complete || block->count == 0) { use_tmpsq = FALSE; } else { /* The final sequence on the block was a probably-incomplete window of the active sequence. * Grab a copy of the end for use in the next pass, to ensure we don't miss hits crossing * the boundary between two blocks. */ esl_sq_Copy(block->list + (block->count - 1) , tmpsq); use_tmpsq = TRUE; } block->first_seqidx = sq_cnt; sq_cnt += block->count - (use_tmpsq ? 1 : 0);// if there's an incomplete sequence read into the block wait to count it until it's complete. /* Read dseqs from block into text element T. * Convert the dsq from esl-alphabet to fm-alphabet (1..k for alphabet of size k). * (a) collapsing upper/lower case for appropriate sorting. * (b) reserving 0 for '$', which must be lexicographically smallest * (these will later be shifted to 0-based alphabet, once SA has been built) * */ block_length = 0; for (i=0; i<block->count; i++) { //start a new block, with space for the name allocateSeqdata(meta, block->list+i, numseqs, &allocedseqs); //meta data meta->seq_data[numseqs].target_id = block->first_seqidx + i ; meta->seq_data[numseqs].target_start = block->list[i].start; meta->seq_data[numseqs].fm_start = block_length; if (block->list[i].name == NULL) meta->seq_data[numseqs].name[0] = '\0'; else strcpy(meta->seq_data[numseqs].name, block->list[i].name ); if (block->list[i].acc == NULL) meta->seq_data[numseqs].acc[0] = '\0'; else strcpy(meta->seq_data[numseqs].acc, block->list[i].acc ); if (block->list[i].source == NULL) meta->seq_data[numseqs].source[0] = '\0'; else strcpy(meta->seq_data[numseqs].source, block->list[i].source ); if (block->list[i].desc == NULL) meta->seq_data[numseqs].desc[0] = '\0'; else strcpy(meta->seq_data[numseqs].desc, block->list[i].desc ); for (j=1; j<=block->list[i].n; j++) { c = abc->sym[block->list[i].dsq[j]]; if ( meta->alph_type == fm_DNA) { if (meta->inv_alph[c] == -1) { // replace ambiguity characters by rotating through A,C,G, and T. c = meta->alph[ambig_repl]; ambig_repl = (ambig_repl+1)%4; if (!in_ambig_run) { fm_addAmbiguityRange(meta->ambig_list, block_length, block_length); in_ambig_run=1; } else { meta->ambig_list->ranges[meta->ambig_list->count - 1].upper = block_length; } } else { in_ambig_run=0; } } else if (meta->inv_alph[c] == -1) { esl_fatal("requested alphabet doesn't match input text\n"); } T[block_length] = meta->inv_alph[c]; block_length++; if (j>block->list[i].C) total_char_count++; // add to total count, only if it's not redundant with earlier read meta->seq_data[numseqs].length++; } numseqs++; } T[block_length] = 0; // last character 0 is effectively '$' for suffix array block_length++; seq_cnt = numseqs-seq_offset; ambig_cnt = meta->ambig_list->count - ambig_offset; //build and write FM-index for T. This will be a BWT on the reverse of the sequence, required for reverse-traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, (uint32_t)block->list[0].C, T, BWT, SA, SAsamp, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); if ( ! meta->fwd_only ) { //build and write FM-index for un-reversed T (used to find reverse hits using forward traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, 0, T, BWT, SA, NULL, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); } prev_numseqs = numseqs; numblocks++; } esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); esl_sq_Destroy(tmpsq); esl_sq_DestroyBlock(block); meta->seq_count = numseqs; meta->block_count = numblocks; /* Finished writing the FM-index data to a temporary file. Now write * metadata to fname_out, than append FM-index data from temp file */ if((fp = fopen(fname_out, "wb")) == NULL) esl_fatal( "%s: Cannot open file `%s': ", argv[0], fname_out); //write out meta data if( fwrite(&(meta->fwd_only), sizeof(meta->fwd_only), 1, fp) != 1 || fwrite(&(meta->alph_type), sizeof(meta->alph_type), 1, fp) != 1 || fwrite(&(meta->alph_size), sizeof(meta->alph_size), 1, fp) != 1 || fwrite(&(meta->charBits), sizeof(meta->charBits), 1, fp) != 1 || fwrite(&(meta->freq_SA), sizeof(meta->freq_SA), 1, fp) != 1 || fwrite(&(meta->freq_cnt_sb), sizeof(meta->freq_cnt_sb), 1, fp) != 1 || fwrite(&(meta->freq_cnt_b), sizeof(meta->freq_cnt_b), 1, fp) != 1 || fwrite(&(meta->block_count), sizeof(meta->block_count), 1, fp) != 1 || fwrite(&(meta->seq_count), sizeof(meta->seq_count), 1, fp) != 1 || fwrite(&(meta->ambig_list->count), sizeof(meta->ambig_list->count), 1, fp) != 1 || fwrite(&total_char_count, sizeof(total_char_count), 1, fp) != 1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); for (i=0; i<meta->seq_count; i++) { if( fwrite(&(meta->seq_data[i].target_id), sizeof(meta->seq_data[i].target_id), 1, fp) != 1 || fwrite(&(meta->seq_data[i].target_start), sizeof(meta->seq_data[i].target_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].fm_start), sizeof(meta->seq_data[i].fm_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].length), sizeof(meta->seq_data[i].length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].name_length), sizeof(meta->seq_data[i].name_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].acc_length), sizeof(meta->seq_data[i].acc_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].source_length),sizeof(meta->seq_data[i].source_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].desc_length), sizeof(meta->seq_data[i].desc_length), 1, fp) != 1 || fwrite(meta->seq_data[i].name, sizeof(char), meta->seq_data[i].name_length+1 , fp) != meta->seq_data[i].name_length+1 || fwrite(meta->seq_data[i].acc, sizeof(char), meta->seq_data[i].acc_length+1 , fp) != meta->seq_data[i].acc_length+1 || fwrite(meta->seq_data[i].source, sizeof(char), meta->seq_data[i].source_length+1, fp) != meta->seq_data[i].source_length+1 || fwrite(meta->seq_data[i].desc, sizeof(char), meta->seq_data[i].desc_length+1 , fp) != meta->seq_data[i].desc_length+1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); } for (i=0; i<meta->ambig_list->count; i++) { if( fwrite(&(meta->ambig_list->ranges[i].lower), sizeof(meta->ambig_list->ranges[i].lower), 1, fp) != 1 || fwrite(&(meta->ambig_list->ranges[i].upper), sizeof(meta->ambig_list->ranges[i].upper), 1, fp) != 1 ) esl_fatal( "%s: Error writing ambiguity data for FM index.\n", argv[0]); } /* now append the FM-index data in fptmp to the desired output file, fp */ rewind(fptmp); for (i=0; i<numblocks; i++) { for(j=0; j< (meta->fwd_only?1:2); j++ ) { //do this once or twice, once for forward-T index, and possibly once for reversed //first, read if(fread(&block_length, sizeof(block_length), 1, fptmp) != 1) esl_fatal( "%s: Error reading block_length in FM index.\n", argv[0]); if(fread(&term_loc, sizeof(term_loc), 1, fptmp) != 1) esl_fatal( "%s: Error reading terminal location in FM index.\n", argv[0]); if(fread(&seq_offset, sizeof(seq_offset), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_offset in FM index.\n", argv[0]); if(fread(&ambig_offset, sizeof(ambig_offset ), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_offset in FM index.\n", argv[0]); if(fread(&overlap, sizeof(overlap), 1, fptmp) != 1) esl_fatal( "%s: Error reading overlap in FM index.\n", argv[0]); if(fread(&seq_cnt, sizeof(seq_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_cnt in FM index.\n", argv[0]); if(fread(&ambig_cnt, sizeof(ambig_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_cnt in FM index.\n", argv[0]); compressed_bytes = ((chars_per_byte-1+block_length)/chars_per_byte); num_freq_cnts_b = 1+ceil((double)block_length/meta->freq_cnt_b); num_freq_cnts_sb = 1+ceil((double)block_length/meta->freq_cnt_sb); num_SA_samples = 1+floor((double)block_length/meta->freq_SA); //j==0 test cause T and SA to be written only for forward sequence if(j==0 && fread(T, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading T in FM index.\n", argv[0]); if(fread(BWT, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading BWT in FM index.\n", argv[0]); if(j==0 && fread(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fptmp) != (size_t)num_SA_samples) esl_fatal( "%s: Error reading SA in FM index.\n", argv[0]); if(fread(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fptmp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error reading occCnts_b in FM index.\n", argv[0]); if(fread(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fptmp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error reading occCnts_sb in FM index.\n", argv[0]); //then, write if(fwrite(&block_length, sizeof(block_length), 1, fp) != 1) esl_fatal( "%s: Error writing block_length in FM index.\n", argv[0]); if(fwrite(&term_loc, sizeof(term_loc), 1, fp) != 1) esl_fatal( "%s: Error writing terminal location in FM index.\n", argv[0]); if(fwrite(&seq_offset, sizeof(seq_offset), 1, fp) != 1) esl_fatal( "%s: Error writing seq_offset in FM index.\n", argv[0]); if(fwrite(&ambig_offset, sizeof(ambig_offset), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_offset in FM index.\n", argv[0]); if(fwrite(&overlap, sizeof(overlap), 1, fp) != 1) esl_fatal( "%s: Error writing overlap in FM index.\n", argv[0]); if(fwrite(&seq_cnt, sizeof(seq_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing seq_cnt in FM index.\n", argv[0]); if(fwrite(&ambig_cnt, sizeof(ambig_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_cnt in FM index.\n", argv[0]); if(j==0 && fwrite(T, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing T in FM index.\n", argv[0]); if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing BWT in FM index.\n", argv[0]); if(j==0 && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples) esl_fatal( "%s: Error writing SA in FM index.\n", argv[0]); if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error writing occCnts_b in FM index.\n", argv[0]); if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error writing occCnts_sb in FM index.\n", argv[0]); } } fprintf (stderr, "Number of characters in index: %ld\n", (long)total_char_count); fprintf (stderr, "Number of FM-index blocks: %ld\n", (long)meta->block_count); fclose(fp); fclose(fptmp); free(T); free(BWT); free(SA); free(SAsamp); free(occCnts_b); free(cnts_b); free(occCnts_sb); free(cnts_sb); fm_metaDestroy(meta); esl_getopts_Destroy(go); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); } return (eslOK); ERROR: /* Deallocate memory. */ if (fp) fclose(fp); if (T) free(T); if (BWT) free(BWT); if (SA) free(SA); if (SAsamp) free(SAsamp); if (occCnts_b) free(occCnts_b); if (cnts_b) free(cnts_b); if (occCnts_sb) free(occCnts_sb); if (cnts_sb) free(cnts_sb); if (ambig_list.ranges) free(ambig_list.ranges); fm_metaDestroy(meta); esl_getopts_Destroy(go); esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); if (tmpsq) esl_sq_Destroy(tmpsq); if (block) esl_sq_DestroyBlock(block); fprintf (stderr, "failure during memory allocation\n"); exit(status); }