int p7_hit_Compare(const P7_HIT *h1, const P7_HIT *h2, float tol) { int d; int status; if ( strcmp(h1->name, h2->name) != 0) return eslFAIL; if (esl_strcmp(h1->acc, h2->acc) != 0) return eslFAIL; if (esl_strcmp(h1->desc, h2->desc) != 0) return eslFAIL; if ( h1->window_length != h2->window_length) return eslFAIL; if ( h1->ndom != h2->ndom) return eslFAIL; if ( h1->noverlaps != h2->noverlaps) return eslFAIL; if ( h1->flags != h2->flags) return eslFAIL; if ( h1->nreported != h2->nreported) return eslFAIL; if ( h1->nincluded != h2->nincluded) return eslFAIL; if ( h1->best_domain != h2->best_domain) return eslFAIL; if ( h1->seqidx != h2->seqidx) return eslFAIL; if ( h1->subseq_start != h2->subseq_start) return eslFAIL; if ( h1->offset != h2->offset) return eslFAIL; if ( esl_DCompare( h1->sortkey, h2->sortkey, tol ) != eslOK) return eslFAIL; if ( esl_FCompare( h1->score, h2->score, tol ) != eslOK) return eslFAIL; if ( esl_FCompare( h1->pre_score, h2->pre_score, tol ) != eslOK) return eslFAIL; if ( esl_FCompare( h1->sum_score, h2->sum_score, tol ) != eslOK) return eslFAIL; if ( esl_DCompare( h1->lnP, h2->lnP, tol ) != eslOK) return eslFAIL; if ( esl_DCompare( h1->pre_lnP, h2->pre_lnP, tol ) != eslOK) return eslFAIL; if ( esl_DCompare( h1->sum_lnP, h2->sum_lnP, tol ) != eslOK) return eslFAIL; if ( esl_DCompare( h1->nexpected, h2->nexpected, tol ) != eslOK) return eslFAIL; for (d = 0; d < h1->ndom; d++) if (( status = p7_domain_Compare(&(h1->dcl[d]), &(h2->dcl[d]), tol)) != eslOK) return status; return eslOK; }
static int process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_fmfile, char **ret_qfile) { ESL_GETOPTS *go = esl_getopts_Create(options); int status; if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* help format: */ if (esl_opt_GetBoolean(go, "-h") == TRUE) { esl_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 120=textwidth*/ if (puts("\nSpecial options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); esl_opt_DisplayHelp(stdout, go, 2, 2, 80); /* 2= group; 2 = indentation; 120=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_qfile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <qfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } if ((*ret_fmfile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <fmfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } /* Validate any attempted use of stdin streams */ if (esl_strcmp(*ret_fmfile, "-") == 0 && esl_strcmp(*ret_qfile, "-") == 0) { if (puts("Either <fmfile> or <qfile> may be '-' (to read from stdin), but not both.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; } *ret_go = go; return eslOK; FAILURE: /* all errors handled here are user errors, so be polite. */ esl_usage(stdout, argv[0], usage); puts("\nwhere basic options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 80=textwidth*/ printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); esl_getopts_Destroy(go); exit(1); ERROR: if (go) esl_getopts_Destroy(go); exit(status); }
static int output_header(FM_METADATA *meta, FILE *ofp, const ESL_GETOPTS *go, char *fmfile, char *qfile) { char *alph; char *appname = NULL; int status; if (meta->alph_type == fm_DNA) alph = "dna"; else if (meta->alph_type == fm_AMINO) alph = "amino"; if ((status = esl_FileTail(go->argv[0], FALSE, &appname)) != eslOK) return status; if (fprintf(ofp, "# %s :: %s\n", appname, banner) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# %s\n", EASEL_COPYRIGHT) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# %s\n", EASEL_LICENSE) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# input binary-formatted HMMER database: %s\n", fmfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# input file of query sequences: %s\n", qfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (esl_opt_IsUsed(go, "--out")) { char *outfile = esl_opt_GetString(go, "--out"); if (fprintf(ofp, "# output file containing list of hits: %s\n", (esl_strcmp(outfile, "-") == 0 ? "stdout" : outfile)) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--count_only") && fprintf(ofp, "# output only counts, not hit locations\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# alphabet : %s\n", alph) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# bin_length : %d\n", meta->freq_cnt_b) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# suffix array sample rate: %d\n", meta->freq_SA) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); if (appname) free(appname); return eslOK; ERROR: if (appname) free(appname); return status; }
static int hit_sorter_by_modelname_aliposition(const void *vh1, const void *vh2) { P7_HIT *h1 = *((P7_HIT **) vh1); /* don't ask. don't change. Don't Panic. */ P7_HIT *h2 = *((P7_HIT **) vh2); int res = esl_strcmp( h1->name, h2->name); if ( res != 0 ) return res; /* first key, seq_idx (unique id for sequences), low to high */ // if on different strand, the positive strand goes first, else use position int dir1 = (h1->dcl[0].ia < h1->dcl[0].ib ? 1 : -1); int dir2 = (h2->dcl[0].ia < h2->dcl[0].ib ? 1 : -1); if (dir1 != dir2) return dir2; // so if dir1 is pos (1), and dir2 is neg (-1), this will return -1, placing h1 before h2; otherwise, vice versa if ( h1->dcl[0].ia == h2->dcl[0].ia) return (h1->dcl[0].ib < h2->dcl[0].ib ? 1 : -1 ); else return (h1->dcl[0].ia > h2->dcl[0].ia ? 1 : -1 ); }
/* Function: main() * Synopsis: Run set of queries against an FM * Purpose: Read in a FM and a file of query sequences. * For each query, find matching FM interval, then collect positions in * the original text T for the corresponding occurrences. These positions * are 0-based (so first character is position 0). */ int main(int argc, char *argv[]) { void* tmp; // used for RALLOC calls clock_t t1, t2; struct tms ts1, ts2; char *fname_fm = NULL; char *fname_queries = NULL; FM_HIT *hits = NULL; char *line = NULL; int status = eslOK; int hit_cnt = 0; int hit_indiv_cnt = 0; int miss_cnt = 0; int hit_num = 0; int hit_num2 = 0; int hits_size = 0; int i,j; int count_only = 0; FM_INTERVAL interval; FM_DATA *fmsf = NULL; FM_DATA *fmsb = NULL; FILE* fp_fm = NULL; FILE* fp = NULL; FILE* out = NULL; char *outname = NULL; ESL_GETOPTS *go = NULL; /* command line processing */ FM_CFG *cfg; FM_METADATA *meta; ESL_SQ *tmpseq; // used for sequence validation ESL_ALPHABET *abc = NULL; //start timer t1 = times(&ts1); process_commandline(argc, argv, &go, &fname_fm, &fname_queries); if (esl_opt_IsOn(go, "--out")) { outname = esl_opt_GetString(go, "--out"); if ( esl_strcmp ("-", outname) == 0 ) { out = stdout; outname = "stdout"; } else { out = fopen(outname,"w"); } } if (esl_opt_IsOn(go, "--count_only")) count_only = 1; if((fp_fm = fopen(fname_fm, "rb")) == NULL) esl_fatal("Cannot open file `%s': ", fname_fm); fm_configAlloc(&cfg); cfg->occCallCnt = 0; meta = cfg->meta; meta->fp = fp_fm; fm_readFMmeta( meta); if (meta->alph_type == fm_DNA) abc = esl_alphabet_Create(eslDNA); else if (meta->alph_type == fm_AMINO) abc = esl_alphabet_Create(eslAMINO); tmpseq = esl_sq_CreateDigital(abc); //read in FM-index blocks ESL_ALLOC(fmsf, meta->block_count * sizeof(FM_DATA) ); if (!meta->fwd_only) ESL_ALLOC(fmsb, meta->block_count * sizeof(FM_DATA) ); for (i=0; i<meta->block_count; i++) { fm_FM_read( fmsf+i,meta, TRUE ); if (!meta->fwd_only) { fm_FM_read(fmsb+i, meta, FALSE ); fmsb[i].SA = fmsf[i].SA; fmsb[i].T = fmsf[i].T; } } fclose(fp_fm); output_header(meta, stdout, go, fname_fm, fname_queries); /* initialize a few global variables, then call initGlobals * to do architecture-specific initialization */ fm_configInit(cfg, NULL); fm_alphabetCreate(meta, NULL); // don't override charBits fp = fopen(fname_queries,"r"); if (fp == NULL) esl_fatal("Unable to open file %s\n", fname_queries); ESL_ALLOC(line, FM_MAX_LINE * sizeof(char)); hits_size = 200; ESL_ALLOC(hits, hits_size * sizeof(FM_HIT)); while(fgets(line, FM_MAX_LINE, fp) ) { int qlen=0; while (line[qlen] != '\0' && line[qlen] != '\n') qlen++; if (line[qlen] == '\n') line[qlen] = '\0'; hit_num = 0; for (i=0; i<meta->block_count; i++) { fm_getSARangeReverse(fmsf+i, cfg, line, meta->inv_alph, &interval); if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_forward); } } /* find reverse hits, using backward search on the forward FM*/ if (!meta->fwd_only) { fm_getSARangeForward(fmsb+i, cfg, line, meta->inv_alph, &interval);// yes, use the backward fm to produce the equivalent of a forward search on the forward fm if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } //even though I used fmsb above, use fmsf here, since we'll now do a backward trace //in the FM-index to find the next sampled SA position getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_backward); } } } } if (hit_num > 0) { if (count_only) { hit_cnt++; hit_indiv_cnt += hit_num; } else { hit_num2 = 0; //for each hit, identify the sequence id and position within that sequence for (i = 0; i< hit_num; i++) { status = fm_getOriginalPosition (fmsf, meta, hits[i].block, hits[i].length, fm_forward, hits[i].start, &(hits[i].block), &(hits[i].start) ); hits[i].sortkey = (status==eslERANGE ? -1 : meta->seq_data[ hits[i].block ].target_id); //validate match - if any characters in orig sequence were ambiguities, reject fm_convertRange2DSQ( fmsf, meta, hits[i].start, hits[i].length, p7_NOCOMPLEMENT, tmpseq, TRUE ); for (j=1; j<=hits[i].length; j++) { if (tmpseq->dsq[j] >= abc->K) { hits[i].sortkey = -1; //reject j = hits[i].length+1; //quit looking } } if (hits[i].sortkey != -1) hit_num2++; // legitimate hit } if (hit_num2 > 0) hit_cnt++; //now sort according the the sequence_id corresponding to that seq_offset qsort(hits, hit_num, sizeof(FM_HIT), hit_sorter); //skim past the skipped entries i = 0; while ( i < hit_num ) { if (hits[i].sortkey != -1 ) break; // i++; } if (i < hit_num) { if (out != NULL) { fprintf (out, "%s\n",line); //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); } hit_indiv_cnt++; i++; // skip the first one, since I'll be comparing each to the previous for ( ; i< hit_num; i++) { if ( //meta->seq_data[ hits[i].block ].id != meta->seq_data[ hits[i-1].block ].id || hits[i].sortkey != hits[i-1].sortkey || //sortkey is seq_data[].id hits[i].direction != hits[i-1].direction || hits[i].start != hits[i-1].start ) { if (out != NULL) //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); hit_indiv_cnt++; } } if (out != NULL) fprintf (out, "\n"); } } } else { miss_cnt++; } } for (i=0; i<meta->block_count; i++) { fm_FM_destroy( fmsf+i, 1 ); if (!meta->fwd_only) fm_FM_destroy( fmsb+i, 0 ); } free (hits); free (line); fclose(fp); fm_configDestroy(cfg); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; double throughput = cfg->occCallCnt/elapsedTime; fprintf (stderr, "hit: %-10d (%d)\n", hit_cnt, hit_indiv_cnt); fprintf (stderr, "miss:%-10d\n", miss_cnt); fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); fprintf (stderr, "occ calls: %12s\n", commaprint(cfg->occCallCnt)); fprintf (stderr, "occ/sec: %12s\n", commaprint(throughput)); } exit(eslOK); ERROR: printf ("failure allocating memory for hits\n"); exit(status); }
/* Function: p7_tophits_RemoveDuplicates() * Synopsis: Remove overlapping hits. * * Purpose: After nhmmer pipeline has completed, the TopHits object may * contain duplicates if the target was broken into overlapping * windows. Scan through, and remove duplicates. Since the * duplicates may be incomplete (one sequence is a partial * hit because it's window didn't cover the full length of * the hit; or perhaps it's full-length, but with slightly * different scoring due to the impact of sequence * composition on score model), keep the one with better * p-value. * * Returns: <eslOK> on success. */ int p7_tophits_RemoveDuplicates(P7_TOPHITS *th, int using_bit_cutoffs) { int i; /* counter over hits */ int j; /* previous un-duplicated hit */ int s_i, s_j, e_i, e_j, dir_i, dir_j, len_i, len_j; int intersect_alistart, intersect_aliend, intersect_alilen; int intersect_hmmstart, intersect_hmmend, intersect_hmmlen; //int64_t sub_i, sub_j; int tmp; double p_i, p_j; int remove; if (th->N<2) return eslOK; j=0; for (i = 1; i < th->N; i++) { //sub_j = th->hit[j]->subseq_start; p_j = th->hit[j]->lnP; s_j = th->hit[j]->dcl[0].ia; e_j = th->hit[j]->dcl[0].ib; dir_j = (s_j < e_j ? 1 : -1); if (dir_j == -1) { tmp = s_j; s_j = e_j; e_j = tmp; } len_j = e_j - s_j + 1 ; //sub_i = th->hit[i]->subseq_start; p_i = th->hit[i]->lnP; s_i = th->hit[i]->dcl[0].ia; e_i = th->hit[i]->dcl[0].ib; dir_i = (s_i < e_i ? 1 : -1); if (dir_i == -1) { tmp = s_i; s_i = e_i; e_i = tmp; } len_i = e_i - s_i + 1 ; // these will only matter if seqidx and strand are the same intersect_alistart = s_i>s_j ? s_i : s_j; intersect_aliend = e_i<e_j ? e_i : e_j; intersect_alilen = intersect_aliend - intersect_alistart + 1; intersect_hmmstart = (th->hit[i]->dcl[0].ad->hmmfrom > th->hit[j]->dcl[0].ad->hmmfrom) ? th->hit[i]->dcl[0].ad->hmmfrom : th->hit[j]->dcl[0].ad->hmmfrom; intersect_hmmend = (th->hit[i]->dcl[0].ad->hmmto < th->hit[j]->dcl[0].ad->hmmto) ? th->hit[i]->dcl[0].ad->hmmto : th->hit[j]->dcl[0].ad->hmmto; intersect_hmmlen = intersect_hmmend - intersect_hmmstart + 1; if ( esl_strcmp(th->hit[i]->name, th->hit[i-1]->name) == 0 && //same model th->hit[i]->seqidx == th->hit[i-1]->seqidx && //same source sequence dir_i == dir_j && // only bother removing if the overlapping hits are on the same strand intersect_hmmlen > 0 && //only if they're both hitting similar parts of the model ( ( s_i >= s_j-3 && s_i <= s_j+3) || // at least one side is essentially flush ( e_i >= e_j-3 && e_i <= e_j+3) || ( intersect_alilen >= len_i * 0.95) || // or one of the hits covers >90% of the other ( intersect_alilen >= len_j * 0.95) ) ) { /* Force one to go unreported. We choose to keep the one with the * better e-value. This addresses two issues * (1) longer hits sometimes encounter higher bias corrections, * leading to lower scores; seems better to focus on the * high-scoring heart of the alignment, if we have a * choice * (2) it is possible that a lower-scoring longer hit (see #1) * that is close to threshold will pass the pipeline in * one condition and not the other (e.g. --toponly, or * single vs multi threaded), and if longer hits obscure * shorter higher-scoring ones, a shorter "hit" might be * lost by being obscured by a longer one that is subsequently * removed due to insufficient score. * see late notes in ~wheelert/notebook/2012/0518-dfam-scripts/00NOTES */ //remove = 0; // 1 := keep i, 0 := keep i-1 remove = p_i < p_j ? j : i; th->hit[remove]->flags |= p7_IS_DUPLICATE; if (using_bit_cutoffs) { //report/include flags were already included, need to remove them here th->hit[remove]->flags &= ~p7_IS_REPORTED; th->hit[remove]->flags &= ~p7_IS_INCLUDED; } j = remove == j ? i : j; } else { j = i; } } return eslOK; }
/* Function: main() * Synopsis: break input sequence set into chunks, for each one building the * Burrows-Wheeler transform and corresponding FM-index. Maintain requisite * meta data. * Notes: Currently depends on the divsufsort-lite code of Yuta Mori, though this * could easily be replaced. */ int main(int argc, char **argv) { char tmp_filename[16] = "fmtmpXXXXXX"; FILE *fptmp = NULL; FILE *fp = NULL; uint8_t *T = NULL; uint8_t *BWT = NULL; int *SA = NULL; //what I write will be 32-bit ints, but I need to keep this as int so it'll work with libdivsufsort uint32_t *SAsamp = NULL; uint32_t *occCnts_sb = NULL; // same indexing as above uint32_t *cnts_sb = NULL; uint16_t *occCnts_b = NULL; // this is logically a 2D array, but will be indexed as occ_cnts[alph_size*index + char] (instead of occ_cnts[index][char]) uint16_t *cnts_b = NULL; FM_METADATA *meta = NULL; clock_t t1, t2; struct tms ts1, ts2; long i,j,c; int status = eslOK; int chars_per_byte; int num_freq_cnts_sb ; int num_freq_cnts_b ; int num_SA_samples ; int infmt = eslSQFILE_UNKNOWN; int alphatype = eslUNKNOWN; int alphaguess =eslUNKNOWN; ESL_ALPHABET *abc = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; ESL_SQ *tmpsq = NULL; ESL_SQ_BLOCK *block = NULL; char *fname_in = NULL; char *fname_out= NULL; int block_size = 50000000; int sq_cnt = 0; int use_tmpsq = 0; uint64_t block_length; uint64_t total_char_count = 0; int max_block_size; int numblocks = 0; uint32_t numseqs = 0; int allocedseqs = 1000; uint32_t seq_offset = 0; uint32_t ambig_offset = 0; uint32_t overlap = 0; uint16_t seq_cnt; uint16_t ambig_cnt; uint32_t prev_numseqs = 0; int compressed_bytes; uint32_t term_loc; ESL_GETOPTS *go = NULL; /* command line processing */ uint8_t ambig_repl = 0; int in_ambig_run = 0; FM_AMBIGLIST ambig_list; ESL_ALLOC (meta, sizeof(FM_METADATA)); if (meta == NULL) esl_fatal("unable to allocate memory to store FM meta data\n"); ESL_ALLOC (meta->ambig_list, sizeof(FM_AMBIGLIST)); if (meta->ambig_list == NULL) esl_fatal("unable to allocate memory to store FM ambiguity data\n"); fm_initAmbiguityList(meta->ambig_list); meta->alph_type = fm_DNA; meta->freq_SA = 8; meta->freq_cnt_b = 256; meta->freq_cnt_sb = pow(2,16); //65536 - that's the # values in a short meta->seq_count = 0; ESL_ALLOC (meta->seq_data, allocedseqs * sizeof(FM_SEQDATA)); if (meta->seq_data == NULL ) esl_fatal("unable to allocate memory to store FM sequence data\n"); process_commandline(argc, argv, &go, &fname_in, &fname_out); if (esl_opt_IsOn(go, "--bin_length")) meta->freq_cnt_b = esl_opt_GetInteger(go, "--bin_length"); if ( meta->freq_cnt_b < 32 || meta->freq_cnt_b >4096 || (meta->freq_cnt_b & (meta->freq_cnt_b - 1)) ) // test power of 2 esl_fatal("bin_length must be a power of 2, at least 128, and at most 4096\n"); if (esl_opt_IsOn(go, "--sa_freq")) meta->freq_SA = esl_opt_GetInteger(go, "--sa_freq"); if ( (meta->freq_SA & (meta->freq_SA - 1)) ) // test power of 2 esl_fatal ("SA_freq must be a power of 2\n"); if (esl_opt_IsOn(go, "--block_size")) block_size = 1000000 * esl_opt_GetInteger(go, "--block_size"); if ( block_size <=0 ) esl_fatal ("block_size must be a positive number\n"); //start timer t1 = times(&ts1); output_header(stdout, go, fname_in, fname_out); if (esl_opt_GetString(go, "--informat") != NULL) { infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); } status = esl_sqfile_Open(fname_in, infmt, NULL, &sqfp); if (status == eslENOTFOUND) esl_fatal("No such file %s", fname_in); else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", fname_in); else if (status != eslOK) esl_fatal("Open failed, code %d.", status); meta->fwd_only = 0; if (esl_opt_IsUsed(go, "--alph")) { meta->alph = esl_opt_GetString(go, "--alph") ; if ( esl_strcmp(meta->alph, "dna")==0 || esl_strcmp(meta->alph, "rna")==0) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "dna_full")==0 || esl_strcmp(meta->alph, "rna_full")==0) { meta->alph_type = fm_DNA_full; alphatype = eslDNA; } else if (esl_strcmp(meta->alph, "amino")==0) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } else { esl_sqfile_GuessAlphabet(sqfp, &alphaguess); if (alphaguess == eslDNA || alphaguess == eslRNA) { meta->alph_type = fm_DNA; alphatype = eslDNA; } else if (alphaguess == eslAMINO) { meta->alph_type = fm_AMINO; alphatype = eslAMINO; meta->fwd_only = 1; } else { esl_fatal("Unknown alphabet type. Try 'dna', 'dna_full', or 'amino'\n%s", ""); } } if (esl_opt_IsOn(go, "--fwd_only") ) meta->fwd_only = 1; meta->alph = NULL; //getInverseAlphabet fm_alphabetCreate(meta, &(meta->charBits)); chars_per_byte = 8/meta->charBits; //shift inv_alph up one, to make space for '$' at 0 for (i=0; i<256; i++) if ( meta->inv_alph[i] >= 0) meta->inv_alph[i]++; abc = esl_alphabet_Create(alphatype); sq = esl_sq_CreateDigital(abc); tmpsq = esl_sq_CreateDigital(abc); esl_sqfile_SetDigital(sqfp, abc); block = esl_sq_CreateDigitalBlock(FM_BLOCK_COUNT, abc); block->complete = FALSE; // max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size*.2; // +1 for the '$' max_block_size = FM_BLOCK_OVERLAP+block_size+1 + block_size; // temporary hack to avoid memory over-runs (see end of 1101_fmindex_benchmarking/00NOTES) if (alphatype == fm_DNA) fm_initAmbiguityList(&ambig_list); /* Allocate BWT, Text, SA, and FM-index data structures, allowing storage of maximally large sequence*/ ESL_ALLOC (T, max_block_size * sizeof(uint8_t)); ESL_ALLOC (BWT, max_block_size * sizeof(uint8_t)); ESL_ALLOC (SA, max_block_size * sizeof(int)); ESL_ALLOC (SAsamp, (1+floor((double)max_block_size/meta->freq_SA) ) * sizeof(uint32_t)); ESL_ALLOC (occCnts_sb, (1+ceil((double)max_block_size/meta->freq_cnt_sb)) * meta->alph_size * sizeof(uint32_t)); // every freq_cnt_sb positions, store an array of ints ESL_ALLOC (cnts_sb, meta->alph_size * sizeof(uint32_t)); ESL_ALLOC (occCnts_b, ( 1+ceil((double)max_block_size/meta->freq_cnt_b)) * meta->alph_size * sizeof(uint16_t)); // every freq_cnt_b positions, store an array of 8-byte ints ESL_ALLOC (cnts_b, meta->alph_size * sizeof(uint16_t)); if((T == NULL) || (BWT == NULL) || (SA==NULL) || (SAsamp==NULL) || (BWT==NULL) || (cnts_b==NULL) || (occCnts_b==NULL) || (cnts_sb==NULL) || (occCnts_sb==NULL) ) { esl_fatal( "%s: Cannot allocate memory.\n", argv[0]); } // Open a temporary file, to which FM-index data will be written if (esl_tmpfile(tmp_filename, &fptmp) != eslOK) esl_fatal("unable to open fm-index tmpfile"); /* Main loop: */ while (status == eslOK ) { //reset block as an empty vessel for (i=0; i<block->count; i++) esl_sq_Reuse(block->list + i); if (use_tmpsq) { esl_sq_Copy(tmpsq , block->list); block->complete = FALSE; //this lets ReadBlock know that it needs to append to a small bit of previously-read seqeunce block->list->C = FM_BLOCK_OVERLAP; // overload the ->C value, which ReadBlock uses to determine how much // overlap should be retained in the ReadWindow step } else { block->complete = TRUE; } status = esl_sqio_ReadBlock(sqfp, block, block_size, -1, alphatype != eslAMINO); if (status == eslEOF) continue; if (status != eslOK) ESL_XEXCEPTION(status, "failure reading sequence block"); seq_offset = numseqs; ambig_offset = meta->ambig_list->count; if (block->complete || block->count == 0) { use_tmpsq = FALSE; } else { /* The final sequence on the block was a probably-incomplete window of the active sequence. * Grab a copy of the end for use in the next pass, to ensure we don't miss hits crossing * the boundary between two blocks. */ esl_sq_Copy(block->list + (block->count - 1) , tmpsq); use_tmpsq = TRUE; } block->first_seqidx = sq_cnt; sq_cnt += block->count - (use_tmpsq ? 1 : 0);// if there's an incomplete sequence read into the block wait to count it until it's complete. /* Read dseqs from block into text element T. * Convert the dsq from esl-alphabet to fm-alphabet (1..k for alphabet of size k). * (a) collapsing upper/lower case for appropriate sorting. * (b) reserving 0 for '$', which must be lexicographically smallest * (these will later be shifted to 0-based alphabet, once SA has been built) * */ block_length = 0; for (i=0; i<block->count; i++) { //start a new block, with space for the name allocateSeqdata(meta, block->list+i, numseqs, &allocedseqs); //meta data meta->seq_data[numseqs].target_id = block->first_seqidx + i ; meta->seq_data[numseqs].target_start = block->list[i].start; meta->seq_data[numseqs].fm_start = block_length; if (block->list[i].name == NULL) meta->seq_data[numseqs].name[0] = '\0'; else strcpy(meta->seq_data[numseqs].name, block->list[i].name ); if (block->list[i].acc == NULL) meta->seq_data[numseqs].acc[0] = '\0'; else strcpy(meta->seq_data[numseqs].acc, block->list[i].acc ); if (block->list[i].source == NULL) meta->seq_data[numseqs].source[0] = '\0'; else strcpy(meta->seq_data[numseqs].source, block->list[i].source ); if (block->list[i].desc == NULL) meta->seq_data[numseqs].desc[0] = '\0'; else strcpy(meta->seq_data[numseqs].desc, block->list[i].desc ); for (j=1; j<=block->list[i].n; j++) { c = abc->sym[block->list[i].dsq[j]]; if ( meta->alph_type == fm_DNA) { if (meta->inv_alph[c] == -1) { // replace ambiguity characters by rotating through A,C,G, and T. c = meta->alph[ambig_repl]; ambig_repl = (ambig_repl+1)%4; if (!in_ambig_run) { fm_addAmbiguityRange(meta->ambig_list, block_length, block_length); in_ambig_run=1; } else { meta->ambig_list->ranges[meta->ambig_list->count - 1].upper = block_length; } } else { in_ambig_run=0; } } else if (meta->inv_alph[c] == -1) { esl_fatal("requested alphabet doesn't match input text\n"); } T[block_length] = meta->inv_alph[c]; block_length++; if (j>block->list[i].C) total_char_count++; // add to total count, only if it's not redundant with earlier read meta->seq_data[numseqs].length++; } numseqs++; } T[block_length] = 0; // last character 0 is effectively '$' for suffix array block_length++; seq_cnt = numseqs-seq_offset; ambig_cnt = meta->ambig_list->count - ambig_offset; //build and write FM-index for T. This will be a BWT on the reverse of the sequence, required for reverse-traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, (uint32_t)block->list[0].C, T, BWT, SA, SAsamp, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); if ( ! meta->fwd_only ) { //build and write FM-index for un-reversed T (used to find reverse hits using forward traversal of the BWT buildAndWriteFMIndex(meta, seq_offset, ambig_offset, seq_cnt, ambig_cnt, 0, T, BWT, SA, NULL, occCnts_sb, cnts_sb, occCnts_b, cnts_b, block_length, fptmp); } prev_numseqs = numseqs; numblocks++; } esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); esl_sq_Destroy(tmpsq); esl_sq_DestroyBlock(block); meta->seq_count = numseqs; meta->block_count = numblocks; /* Finished writing the FM-index data to a temporary file. Now write * metadata to fname_out, than append FM-index data from temp file */ if((fp = fopen(fname_out, "wb")) == NULL) esl_fatal( "%s: Cannot open file `%s': ", argv[0], fname_out); //write out meta data if( fwrite(&(meta->fwd_only), sizeof(meta->fwd_only), 1, fp) != 1 || fwrite(&(meta->alph_type), sizeof(meta->alph_type), 1, fp) != 1 || fwrite(&(meta->alph_size), sizeof(meta->alph_size), 1, fp) != 1 || fwrite(&(meta->charBits), sizeof(meta->charBits), 1, fp) != 1 || fwrite(&(meta->freq_SA), sizeof(meta->freq_SA), 1, fp) != 1 || fwrite(&(meta->freq_cnt_sb), sizeof(meta->freq_cnt_sb), 1, fp) != 1 || fwrite(&(meta->freq_cnt_b), sizeof(meta->freq_cnt_b), 1, fp) != 1 || fwrite(&(meta->block_count), sizeof(meta->block_count), 1, fp) != 1 || fwrite(&(meta->seq_count), sizeof(meta->seq_count), 1, fp) != 1 || fwrite(&(meta->ambig_list->count), sizeof(meta->ambig_list->count), 1, fp) != 1 || fwrite(&total_char_count, sizeof(total_char_count), 1, fp) != 1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); for (i=0; i<meta->seq_count; i++) { if( fwrite(&(meta->seq_data[i].target_id), sizeof(meta->seq_data[i].target_id), 1, fp) != 1 || fwrite(&(meta->seq_data[i].target_start), sizeof(meta->seq_data[i].target_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].fm_start), sizeof(meta->seq_data[i].fm_start), 1, fp) != 1 || fwrite(&(meta->seq_data[i].length), sizeof(meta->seq_data[i].length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].name_length), sizeof(meta->seq_data[i].name_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].acc_length), sizeof(meta->seq_data[i].acc_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].source_length),sizeof(meta->seq_data[i].source_length), 1, fp) != 1 || fwrite(&(meta->seq_data[i].desc_length), sizeof(meta->seq_data[i].desc_length), 1, fp) != 1 || fwrite(meta->seq_data[i].name, sizeof(char), meta->seq_data[i].name_length+1 , fp) != meta->seq_data[i].name_length+1 || fwrite(meta->seq_data[i].acc, sizeof(char), meta->seq_data[i].acc_length+1 , fp) != meta->seq_data[i].acc_length+1 || fwrite(meta->seq_data[i].source, sizeof(char), meta->seq_data[i].source_length+1, fp) != meta->seq_data[i].source_length+1 || fwrite(meta->seq_data[i].desc, sizeof(char), meta->seq_data[i].desc_length+1 , fp) != meta->seq_data[i].desc_length+1 ) esl_fatal( "%s: Error writing meta data for FM index.\n", argv[0]); } for (i=0; i<meta->ambig_list->count; i++) { if( fwrite(&(meta->ambig_list->ranges[i].lower), sizeof(meta->ambig_list->ranges[i].lower), 1, fp) != 1 || fwrite(&(meta->ambig_list->ranges[i].upper), sizeof(meta->ambig_list->ranges[i].upper), 1, fp) != 1 ) esl_fatal( "%s: Error writing ambiguity data for FM index.\n", argv[0]); } /* now append the FM-index data in fptmp to the desired output file, fp */ rewind(fptmp); for (i=0; i<numblocks; i++) { for(j=0; j< (meta->fwd_only?1:2); j++ ) { //do this once or twice, once for forward-T index, and possibly once for reversed //first, read if(fread(&block_length, sizeof(block_length), 1, fptmp) != 1) esl_fatal( "%s: Error reading block_length in FM index.\n", argv[0]); if(fread(&term_loc, sizeof(term_loc), 1, fptmp) != 1) esl_fatal( "%s: Error reading terminal location in FM index.\n", argv[0]); if(fread(&seq_offset, sizeof(seq_offset), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_offset in FM index.\n", argv[0]); if(fread(&ambig_offset, sizeof(ambig_offset ), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_offset in FM index.\n", argv[0]); if(fread(&overlap, sizeof(overlap), 1, fptmp) != 1) esl_fatal( "%s: Error reading overlap in FM index.\n", argv[0]); if(fread(&seq_cnt, sizeof(seq_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading seq_cnt in FM index.\n", argv[0]); if(fread(&ambig_cnt, sizeof(ambig_cnt), 1, fptmp) != 1) esl_fatal( "%s: Error reading ambig_cnt in FM index.\n", argv[0]); compressed_bytes = ((chars_per_byte-1+block_length)/chars_per_byte); num_freq_cnts_b = 1+ceil((double)block_length/meta->freq_cnt_b); num_freq_cnts_sb = 1+ceil((double)block_length/meta->freq_cnt_sb); num_SA_samples = 1+floor((double)block_length/meta->freq_SA); //j==0 test cause T and SA to be written only for forward sequence if(j==0 && fread(T, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading T in FM index.\n", argv[0]); if(fread(BWT, sizeof(uint8_t), compressed_bytes, fptmp) != compressed_bytes) esl_fatal( "%s: Error reading BWT in FM index.\n", argv[0]); if(j==0 && fread(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fptmp) != (size_t)num_SA_samples) esl_fatal( "%s: Error reading SA in FM index.\n", argv[0]); if(fread(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fptmp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error reading occCnts_b in FM index.\n", argv[0]); if(fread(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fptmp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error reading occCnts_sb in FM index.\n", argv[0]); //then, write if(fwrite(&block_length, sizeof(block_length), 1, fp) != 1) esl_fatal( "%s: Error writing block_length in FM index.\n", argv[0]); if(fwrite(&term_loc, sizeof(term_loc), 1, fp) != 1) esl_fatal( "%s: Error writing terminal location in FM index.\n", argv[0]); if(fwrite(&seq_offset, sizeof(seq_offset), 1, fp) != 1) esl_fatal( "%s: Error writing seq_offset in FM index.\n", argv[0]); if(fwrite(&ambig_offset, sizeof(ambig_offset), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_offset in FM index.\n", argv[0]); if(fwrite(&overlap, sizeof(overlap), 1, fp) != 1) esl_fatal( "%s: Error writing overlap in FM index.\n", argv[0]); if(fwrite(&seq_cnt, sizeof(seq_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing seq_cnt in FM index.\n", argv[0]); if(fwrite(&ambig_cnt, sizeof(ambig_cnt), 1, fp) != 1) esl_fatal( "%s: Error writing ambig_cnt in FM index.\n", argv[0]); if(j==0 && fwrite(T, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing T in FM index.\n", argv[0]); if(fwrite(BWT, sizeof(uint8_t), compressed_bytes, fp) != compressed_bytes) esl_fatal( "%s: Error writing BWT in FM index.\n", argv[0]); if(j==0 && fwrite(SAsamp, sizeof(uint32_t), (size_t)num_SA_samples, fp) != (size_t)num_SA_samples) esl_fatal( "%s: Error writing SA in FM index.\n", argv[0]); if(fwrite(occCnts_b, sizeof(uint16_t)*(meta->alph_size), (size_t)num_freq_cnts_b, fp) != (size_t)num_freq_cnts_b) esl_fatal( "%s: Error writing occCnts_b in FM index.\n", argv[0]); if(fwrite(occCnts_sb, sizeof(uint32_t)*(meta->alph_size), (size_t)num_freq_cnts_sb, fp) != (size_t)num_freq_cnts_sb) esl_fatal( "%s: Error writing occCnts_sb in FM index.\n", argv[0]); } } fprintf (stderr, "Number of characters in index: %ld\n", (long)total_char_count); fprintf (stderr, "Number of FM-index blocks: %ld\n", (long)meta->block_count); fclose(fp); fclose(fptmp); free(T); free(BWT); free(SA); free(SAsamp); free(occCnts_b); free(cnts_b); free(occCnts_sb); free(cnts_sb); fm_metaDestroy(meta); esl_getopts_Destroy(go); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); } return (eslOK); ERROR: /* Deallocate memory. */ if (fp) fclose(fp); if (T) free(T); if (BWT) free(BWT); if (SA) free(SA); if (SAsamp) free(SAsamp); if (occCnts_b) free(occCnts_b); if (cnts_b) free(cnts_b); if (occCnts_sb) free(occCnts_sb); if (cnts_sb) free(cnts_sb); if (ambig_list.ranges) free(ambig_list.ranges); fm_metaDestroy(meta); esl_getopts_Destroy(go); esl_sqfile_Close(sqfp); esl_alphabet_Destroy(abc); esl_sq_Destroy(sq); if (tmpsq) esl_sq_Destroy(tmpsq); if (block) esl_sq_DestroyBlock(block); fprintf (stderr, "failure during memory allocation\n"); exit(status); }