/* serial_master()
 * The serial version of hmmsearch.
 * Searches the target sequence database with the first query HMM read
 * from <cfg->hmmfile>. (The loop over further queries is currently
 * disabled; see the commented-out `while` below.)
 *
 * Args:
 *   go  - parsed command line options (--notextw, --textw, --tformat,
 *         --restrictdb_stkey, --restrictdb_n, --ssifile, -o are read here)
 *   cfg - run configuration; <dbfile>, <hmmfile>, <firstseq_key> and
 *         <n_targetseq> are read here
 *
 * Returns:
 *   eslOK on success. eslFAIL if control reaches the ERROR label, which
 *   is the failure target of the ESL_ALLOC of the worker info array.
 *
 * A master can only return if it's successful. All errors are handled
 * immediately and fatally with p7_Fail(). We also use the
 * ESL_EXCEPTION and ERROR: mechanisms, but only because we know we're
 * using a fatal exception handler.
 */
static int
serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
{
  FILE            *ofp      = stdout;            /* results output file (-o)                 */
  P7_HMMFILE      *hfp      = NULL;              /* open input HMM file                      */
  ESL_SQFILE      *dbfp     = NULL;              /* open input sequence file                 */
  P7_HMM          *hmm      = NULL;              /* one HMM query                            */
  ESL_ALPHABET    *abc      = NULL;              /* digital alphabet                         */
  int              dbfmt    = eslSQFILE_UNKNOWN; /* format code for sequence database file   */
  ESL_STOPWATCH   *w;
  int              textw    = 0;                 /* only consumed by the disabled output calls below */
  int              nquery   = 0;
  int              status   = eslOK;
  int              hstatus  = eslOK;
  int              sstatus  = eslOK;
  int              i;
  int              infocnt  = 0;
  WORKER_INFO     *info     = NULL;
  char             errbuf[eslERRBUFSIZE];

  w = esl_stopwatch_Create();

  if (esl_opt_GetBoolean(go, "--notextw")) textw = 0;
  else                                     textw = esl_opt_GetInteger(go, "--textw");

  if (esl_opt_IsOn(go, "--tformat")) {
    dbfmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat"));
    if (dbfmt == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat"));
  }

  /* Open the target sequence database */
  status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp);
  if      (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n",          cfg->dbfile);
  else if (status == eslEFORMAT)   p7_Fail("Sequence file %s is empty or misformatted\n",            cfg->dbfile);
  else if (status == eslEINVAL)    p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
  else if (status != eslOK)        p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile);

  if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) {
    if (esl_opt_IsUsed(go, "--ssifile"))
      esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile"));
    else
      esl_sqfile_OpenSSI(dbfp, NULL);
  }

  /* Open the query profile HMM file */
  status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", cfg->hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                cfg->hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, cfg->hmmfile, errbuf);

  /* Open the results output files */
  if (esl_opt_IsOn(go, "-o")) {
    if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w+")) == NULL)
      p7_Fail("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o"));
  }

  infocnt = 1;
  ESL_ALLOC(info, sizeof(*info) * infocnt);

  /* <abc> is not known 'til first HMM is read. */
  hstatus = p7_hmmfile_Read(hfp, &abc, &hmm);
  if (hstatus == eslOK)
    {
      /* One-time initializations after alphabet <abc> becomes known */
      // output_header(ofp, go, cfg->hmmfile, cfg->dbfile);
      // dbfp->abc = abc; //ReadBlock requires knowledge of the alphabet to decide how best to read blocks
      // for (i = 0; i < infocnt; ++i)
      //   {
      //     info[i].bg = p7_bg_Create(abc);
      //   }
    }

  /* Outer loop: over each query HMM in <hmmfile>.
   * The loop itself is disabled, so at most one query is searched. The
   * body is still guarded on a successful read: without the guard a
   * failed or EOF read would dereference a NULL <hmm> below.
   */
  // while (hstatus == eslOK)
  //   {
  if (hstatus == eslOK)
    {
      P7_PROFILE  *gm = NULL;
      P7_OPROFILE *om = NULL;       /* optimized query profile */

      nquery++;
      esl_stopwatch_Start(w);

      /* seqfile may need to be rewound (multiquery mode) */
      if (nquery > 1)
        {
          if (! esl_sqfile_IsRewindable(dbfp))
            esl_fatal("Target sequence file %s isn't rewindable; can't search it with multiple queries", cfg->dbfile);

          /* only re-set current position to 0 if we're not planning to set it in a moment */
          if (! esl_opt_IsUsed(go, "--restrictdb_stkey") )
            esl_sqfile_Position(dbfp, 0);
        }

      if ( cfg->firstseq_key != NULL )
        {
          /* it's tempting to want to do this once and capture the offset position
           * for future passes, but ncbi files make this non-trivial, so this keeps it general */
          sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key);
          if (sstatus != eslOK)
            p7_Fail("Failure setting restrictdb_stkey to %s\n", cfg->firstseq_key); /* key is a string: %s, not %d */
        }

      // if (fprintf(ofp, "Query: %s [M=%d]\n", hmm->name, hmm->M) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      // if (hmm->acc)  { if (fprintf(ofp, "Accession: %s\n", hmm->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
      // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }

      /* Convert to an optimized model */
      gm = p7_profile_Create (hmm->M, abc);
      om = p7_oprofile_Create(hmm->M, abc);
      // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */
      p7_oprofile_Convert(gm, om);  /* <om> is now p7_LOCAL, multihit */

      for (i = 0; i < infocnt; ++i)
        {
          /* Create processing pipeline and hit list */
          info[i].th  = p7_tophits_Create();
          info[i].om  = p7_oprofile_Clone(om);
          info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */

          /* Per-model pipeline bookkeeping, done inline here. */
          P7_PIPELINE *pli = info[i].pli;
          pli->nmodels++;
          pli->nnodes += info[i].om->M;
          // if (pli->Z_setby == p7_ZSETBY_NTARGETS && pli->mode == p7_SCAN_MODELS) pli->Z = pli->nmodels;
          // if (pli->do_biasfilter) p7_bg_SetFilter(info[i].bg, info[i].om->M, info[i].om->compo);
          // if (pli->mode == p7_SEARCH_SEQS)
          //   status = p7_pli_NewModelThresholds(pli, info[i].om);
          pli->W = info[i].om->max_length;
        }

      sstatus = serial_loop(info, dbfp, cfg->n_targetseq, ofp);
      switch(sstatus)
        {
        case eslEFORMAT:
          esl_fatal("Parse failed (sequence file %s):\n%s\n", dbfp->filename, esl_sqfile_GetErrorBuf(dbfp));
          break;
        case eslEOF:
          /* do nothing */
          break;
        default:
          esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename);
        }

      /* merge the results of the search results */
      for (i = 1; i < infocnt; ++i)
        {
          p7_tophits_Merge(info[0].th, info[i].th);
          p7_pipeline_Merge(info[0].pli, info[i].pli);

          p7_pipeline_Destroy(info[i].pli);
          p7_tophits_Destroy(info[i].th);
          p7_oprofile_Destroy(info[i].om);
        }

      /* Print the results. */
      p7_tophits_SortBySortkey(info->th);
      p7_tophits_Threshold(info->th, info->pli);
      // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
      // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");

      esl_stopwatch_Stop(w);
      // p7_pli_Statistics(ofp, info->pli, w);
      // if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");

      p7_pipeline_Destroy(info->pli);
      p7_tophits_Destroy(info->th);
      p7_oprofile_Destroy(info->om);
      p7_oprofile_Destroy(om);
      p7_profile_Destroy(gm);
      p7_hmm_Destroy(hmm);

      // hstatus = p7_hmmfile_Read(hfp, &abc, &hmm);
    }
  //   } /* end outer loop over query HMMs */

  switch(hstatus)
    {
    case eslEOD:       p7_Fail("read failed, HMM file %s may be truncated?", cfg->hmmfile);      break;
    case eslEFORMAT:   p7_Fail("bad file format in HMM file %s",             cfg->hmmfile);      break;
    case eslEINCOMPAT: p7_Fail("HMM file %s contains different alphabets",   cfg->hmmfile);      break;
    case eslEOF:
    case eslOK:        /* do nothing. EOF is what we want. */                                    break;
    default:           p7_Fail("Unexpected error (%d) in reading HMMs from %s", hstatus, cfg->hmmfile);
    }

  /* Terminate outputs... any last words? */

  /* Cleanup - prepare for exit */
  // for (i = 0; i < infocnt; ++i)
  //   p7_bg_Destroy(info[i].bg);
  free(info);
  p7_hmmfile_Close(hfp);
  esl_sqfile_Close(dbfp);
  esl_alphabet_Destroy(abc);
  esl_stopwatch_Destroy(w);

  if (ofp != stdout) fclose(ofp);

  /* NOTE(review): looks like a leftover debug marker; kept so stdout is unchanged - confirm before removing */
  printf("44HHHH \n");
  return eslOK;

 ERROR:
  return eslFAIL;
}
/* p7_seqcache_Open()
 * Load a cached sequence database file into memory.
 *
 * The file is opened as FASTA, but its first line is a header of the form
 *
 *   #<res_count> <seq_count> <db_count> <db_sequences_1> <db_sequences_before_removing_duplicates_1> <db_sequences_2> <db_sequences_before_removing_duplicates_2> ... <date_stamp>
 *
 * The rest of the file is FASTA. The first word of each description line
 * is a string of '0'/'1' characters: character k being '1' means the
 * sequence belongs to database k. The remainder of the description, if
 * any, is cached as the sequence description.
 *
 * Args:
 *   seqfile   - path of the cache file to load
 *   ret_cache - RETURN: the newly built cache; written only on success
 *   errbuf    - optional message buffer; cleared here but otherwise
 *               CURRENTLY UNUSED. FIXME
 *
 * Returns:
 *   eslOK on success.
 *   eslEFORMAT if the header or the sequence data is inconsistent.
 *   eslEMEM on allocation failure.
 *   Otherwise the non-OK status of the failing sequence-file call.
 *   On every failure all resources acquired here are released.
 */
int
p7_seqcache_Open(char *seqfile, P7_SEQCACHE **ret_cache, char *errbuf)
{
  int                i;
  int                inx;
  int                status;

  int32_t            seq_cnt  = 0;
  int32_t            db_cnt   = 0;
  int32_t            db_inx[32];
  uint32_t           val;
  uint32_t           db_key;

  uint64_t           res_cnt;
  uint64_t           res_size;
  uint64_t           hdr_size;

  char              *hdr_ptr;
  char              *res_ptr;
  char              *desc_ptr;
  char              *ptr;
  char               buffer[512];

  off_t              offset;
  uint64_t           total_mem;

  SEQ_DB            *db       = NULL;
  P7_SEQCACHE       *cache    = NULL;

  ESL_RANDOMNESS    *rnd      = NULL;
  ESL_SQFILE        *sqfp     = NULL;
  ESL_SQ            *sq       = NULL;
  ESL_ALPHABET      *abc      = NULL;
  ESL_SQASCII_DATA  *ascii    = NULL;

  if (errbuf) errbuf[0] = '\0'; /* CURRENTLY UNUSED. FIXME */

  /* Open the target sequence database */
  if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) return status;

  /* This is a bit of a hack. The header line is read directly from the
   * underlying FILE, parsed and saved. Then the parser is repositioned
   * after the line and used normally.
   */
  ascii = &sqfp->data.ascii;
  fseek(ascii->fp, 0L, SEEK_SET);
  if (fgets(buffer, sizeof(buffer), ascii->fp) == NULL) { status = eslEFORMAT; goto ERROR; }
  if (buffer[0] != '#')                                 { status = eslEFORMAT; goto ERROR; }

  ptr = buffer + 1;
  res_cnt = strtoll(ptr, &ptr, 10);
  seq_cnt = strtol(ptr, &ptr, 10);
  db_cnt  = strtol(ptr, &ptr, 10);

  /* Reject counts that cannot be valid before they are used as sizes. */
  if (seq_cnt < 0 || db_cnt < 0 || (size_t) db_cnt > (sizeof(db_inx)/sizeof(db_inx[0])))
    {
      /* zero the counts so the cleanup loops below do nothing */
      seq_cnt = 0;
      db_cnt  = 0;
      status  = eslEFORMAT;
      goto ERROR;
    }

  total_mem = sizeof(P7_SEQCACHE);
  ESL_ALLOC(cache, sizeof(P7_SEQCACHE));
  memset(cache, 0, sizeof(P7_SEQCACHE));

  if ((status = esl_strdup(seqfile, -1, &cache->name)) != eslOK) goto ERROR;

  total_mem += (sizeof(HMMER_SEQ) * seq_cnt);
  ESL_ALLOC(cache->list, sizeof(HMMER_SEQ) * seq_cnt);
  memset(cache->list, 0, sizeof(HMMER_SEQ) * seq_cnt);

  total_mem += (sizeof(SEQ_DB) * db_cnt);
  ESL_ALLOC(db, sizeof(SEQ_DB) * db_cnt);
  memset(db, 0, sizeof(SEQ_DB) * db_cnt);  /* so cleanup can free every db[i].list safely */

  for (i = 0; i < db_cnt; ++i)
    {
      db[i].count = strtol(ptr, &ptr, 10);
      db[i].K     = strtol(ptr, &ptr, 10);
      total_mem += (sizeof(HMMER_SEQ *) * db[i].count);
      ESL_ALLOC(db[i].list, sizeof(HMMER_SEQ *) * db[i].count);
      memset(db[i].list, 0, sizeof(HMMER_SEQ *) * db[i].count);
    }

  /* grab the unique identifier: rest of the header line, trimmed of surrounding whitespace */
  while (*ptr && isspace((unsigned char) *ptr)) ++ptr;
  i = (int) strlen(ptr);
  ESL_ALLOC(cache->id, i+1);
  strcpy(cache->id, ptr);
  while (--i > 0 && isspace((unsigned char) cache->id[i])) cache->id[i] = 0;

  /* one sentinel byte per sequence plus a final one; 10 bytes of header per sequence */
  res_size = res_cnt + (uint64_t) seq_cnt + 1;
  hdr_size = (uint64_t) seq_cnt * 10;

  total_mem += res_size + hdr_size;
  ESL_ALLOC(cache->residue_mem, res_size);
  ESL_ALLOC(cache->header_mem,  hdr_size);

  /* position the sequence file to the start of the first sequence.
   * this will force any buffers associated with the file to be reset.
   */
  offset = ftell(ascii->fp);
  if ((status = esl_sqfile_Position(sqfp, offset)) != eslOK) goto ERROR;

  if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) { status = eslEMEM; goto ERROR; }
  if ((sq  = esl_sq_CreateDigital(abc))     == NULL) { status = eslEMEM; goto ERROR; }

  cache->db_cnt   = db_cnt;
  cache->db       = db;
  cache->abc      = abc;
  cache->res_size = res_size;
  cache->hdr_size = hdr_size;
  cache->count    = seq_cnt;

  hdr_ptr = cache->header_mem;
  res_ptr = cache->residue_mem;
  for (i = 0; i < db_cnt; ++i) db_inx[i] = 0;

  /* <buffer> is reused as a 9-digit decimal counter naming each sequence */
  strcpy(buffer, "000000001");

  inx = 0;
  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      /* sanity checks */
      if (inx >= seq_cnt)                   { printf("inx: %d\n", inx);                                           status = eslEFORMAT; goto ERROR; }
      if ((uint64_t)(sq->n + 1) > res_size) { printf("inx: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); status = eslEFORMAT; goto ERROR; }
      if (hdr_size == 0)                    { printf("inx: %d hdr %d\n", inx, (int)hdr_size);                     status = eslEFORMAT; goto ERROR; }

      /* generate the database key - modified to take the first word in the desc line.
       * The remaining part of the desc is then cached as the description.
       */
      ptr      = sq->desc;
      desc_ptr = strchr(sq->desc, ' ');
      if (desc_ptr != NULL) { *desc_ptr = '\0'; ++desc_ptr; }

      /* unsigned bit mask: shifting past bit 31 drops to zero instead of overflowing */
      val    = 1;
      db_key = 0;
      while (*ptr)
        {
          if (*ptr == '1') db_key += val;
          val <<= 1;
          ++ptr;
        }

      /* Only bits 0..db_cnt-1 name a database; a higher bit would index past db[] below. */
      if (db_cnt < 32 && db_key >= ((uint32_t) 1 << db_cnt))
        {
          printf("inx: %d db %u %s\n", inx, (unsigned) db_key, sq->desc);
          status = eslEFORMAT;
          goto ERROR;
        }

      cache->list[inx].name   = hdr_ptr;
      cache->list[inx].dsq    = (ESL_DSQ *)res_ptr;
      cache->list[inx].n      = sq->n;
      cache->list[inx].idx    = inx;
      cache->list[inx].db_key = db_key;
      if (desc_ptr != NULL &&
          (status = esl_strdup(desc_ptr, -1, &(cache->list[inx].desc))) != eslOK) goto ERROR;

      /* copy the digitized sequence */
      memcpy(res_ptr, sq->dsq, sq->n + 1);
      res_ptr  += (sq->n + 1);
      res_size -= (sq->n + 1);

      /* copy the index to the header */
      strcpy(hdr_ptr, buffer);
      hdr_ptr  += 10;
      hdr_size -= 10;

      /* increment the buffer string */
      ++buffer[8];
      for (i = 8; i > 0; --i)
        {
          if (buffer[i] > '9')
            {
              buffer[i] = '0';
              buffer[i-1]++;
            }
        }

      esl_sq_Reuse(sq);
      ++inx;
    }
  if (status != eslEOF) { printf("Unexpected error %d at %d\n", status, inx);                           goto ERROR; }

  if (inx != seq_cnt)   { printf("inx:: %d %d\n", inx, seq_cnt);                                        status = eslEFORMAT; goto ERROR; }
  if (hdr_size != 0)    { printf("inx:: %d hdr %d\n", inx, (int)hdr_size);                              status = eslEFORMAT; goto ERROR; }
  if (res_size != 1)    { printf("inx:: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size);          status = eslEFORMAT; goto ERROR; }

  /* copy the final sentinel character */
  *res_ptr++ = eslDSQ_SENTINEL;
  --res_size;

  /* sort the order of the database sequences: each entry gets a generated
   * key in <idx>, then the list is sorted with sort_seq */
  if ((rnd = esl_randomness_CreateFast(seq_cnt)) == NULL) { status = eslEMEM; goto ERROR; }
  for (i = 0 ; i < seq_cnt; ++i)
    {
      rnd->x = rnd->x * 69069 + 1;
      cache->list[i].idx = rnd->x;
    }
  esl_randomness_Destroy(rnd);
  rnd = NULL;
  qsort(cache->list, seq_cnt, sizeof(HMMER_SEQ), sort_seq);

  /* fill in the different databases and fix the index */
  for (i = 0 ; i < seq_cnt; ++i)
    {
      inx    = 0;
      db_key = cache->list[i].db_key;
      while (db_key)
        {
          if (db_key & 1)
            {
              SEQ_DB *target = cache->db + inx;
              if (db_inx[inx] >= target->count)
                {
                  printf("sort:: %d %d\n", db_inx[inx], target->count);
                  status = eslEFORMAT;
                  goto ERROR;
                }
              target->list[db_inx[inx]] = &cache->list[i];
              ++db_inx[inx];
            }
          db_key >>= 1;
          ++inx;
        }
      /* restore a 1-based index from the entry's slot in the header block */
      cache->list[i].idx = (cache->list[i].name - cache->header_mem) / 10 + 1;
    }

  for (i = 0; i < cache->db_cnt; ++i)
    {
      printf("sequence database (%d):: %d %d\n", i, cache->db[i].count, db_inx[i]);
    }

  printf("\nLoaded sequence db file %s; total memory %" PRIu64 "\n", seqfile, total_mem);

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);

  *ret_cache = cache;

  return eslOK;

 ERROR:
  /* Single cleanup path: release everything acquired so far. */
  if (rnd  != NULL) esl_randomness_Destroy(rnd);
  if (sqfp != NULL) esl_sqfile_Close(sqfp);
  if (sq   != NULL) esl_sq_Destroy(sq);
  if (abc  != NULL) esl_alphabet_Destroy(abc);
  if (db   != NULL)
    {
      for (i = 0; i < db_cnt; ++i) free(db[i].list);
      free(db);
    }
  if (cache != NULL)
    {
      if (cache->list != NULL)
        {
          for (i = 0; i < seq_cnt; ++i) free(cache->list[i].desc);
          free(cache->list);
        }
      free(cache->header_mem);
      free(cache->residue_mem);
      free(cache->name);
      free(cache->id);
      free(cache);
    }
  return status;
}