/* Function: esl_keyhash_Clone() * Synopsis: Duplicates a keyhash. * * Purpose: Allocates and duplicates a keyhash <kh>. Returns a * pointer to the duplicate. * * Throws: <NULL> on allocation failure. */ ESL_KEYHASH * esl_keyhash_Clone(const ESL_KEYHASH *kh) { ESL_KEYHASH *nw; int h; if ((nw = keyhash_create(kh->hashsize, kh->kalloc, kh->salloc)) == NULL) goto ERROR; for (h = 0; h < kh->hashsize; h++) nw->hashtable[h] = kh->hashtable[h]; for (h = 0; h < kh->nkeys; h++) { nw->nxt[h] = kh->nxt[h]; nw->key_offset[h] = kh->key_offset[h]; } nw->nkeys = kh->nkeys; memcpy(nw->smem, kh->smem, sizeof(char) * kh->sn); nw->sn = kh->sn; return nw; ERROR: esl_keyhash_Destroy(nw); return NULL; }
/* keyhash_create() * * The real creation function, which takes arguments for memory sizes. * This is abstracted to a static function because it's used by both * Create() and Clone() but slightly differently. * * Args: hashsize - size of hash table; this must be a power of two. * init_key_alloc - initial allocation for # of keys. * init_string_alloc - initial allocation for total size of key strings. * * Returns: An allocated hash table structure; or NULL on failure. */ ESL_KEYHASH * keyhash_create(uint32_t hashsize, int init_key_alloc, int init_string_alloc) { ESL_KEYHASH *kh = NULL; int i; int status; ESL_ALLOC(kh, sizeof(ESL_KEYHASH)); kh->hashtable = NULL; kh->key_offset = NULL; kh->nxt = NULL; kh->smem = NULL; kh->hashsize = hashsize; kh->kalloc = init_key_alloc; kh->salloc = init_string_alloc; ESL_ALLOC(kh->hashtable, sizeof(int) * kh->hashsize); for (i = 0; i < kh->hashsize; i++) kh->hashtable[i] = -1; ESL_ALLOC(kh->key_offset, sizeof(int) * kh->kalloc); ESL_ALLOC(kh->nxt, sizeof(int) * kh->kalloc); for (i = 0; i < kh->kalloc; i++) kh->nxt[i] = -1; ESL_ALLOC(kh->smem, sizeof(char) * kh->salloc); kh->nkeys = 0; kh->sn = 0; return kh; ERROR: esl_keyhash_Destroy(kh); return NULL; }
/* multifetch:
 * <keyfile> lists one HMM name or key per line ('#' starts a comment).
 * Each key is read and stored in a keyhash; a key that appears twice
 * is a fatal error.
 *
 * With an SSI index on <hfp>, each HMM is fetched via onefetch() as
 * soon as its key is read, so HMMs come out in <keyfile> order.
 *
 * Without an SSI index, the whole HMM file is then read in a single
 * pass, and every HMM whose name or accession is in the keyhash is
 * written to <ofp>; HMMs then come out in HMM file order.
 *
 * When <ofp> is not stdout, a count of retrieved HMMs is printed to
 * stdout at the end.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, P7_HMMFILE *hfp)
{
  ESL_KEYHASH    *wanted   = esl_keyhash_Create();
  ESL_FILEPARSER *parser   = NULL;
  ESL_ALPHABET   *abc      = NULL;
  P7_HMM         *hmm      = NULL;
  int             nfetched = 0;
  char           *name;
  int             namelen;
  int             idx;
  int             status;
  int             hit;

  if (esl_fileparser_Open(keyfile, NULL, &parser) != eslOK)
    p7_Fail("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(parser, '#');

  /* Pass 1: collect the keys, fetching immediately if we have an index. */
  while (esl_fileparser_NextLine(parser) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(parser, &name, &namelen) != eslOK)
        p7_Fail("Failed to read HMM name on line %d of file %s\n", parser->linenumber, keyfile);

      status = esl_key_Store(wanted, name, &idx);
      if (status == eslEDUP)
        p7_Fail("HMM key %s occurs more than once in file %s\n", name, keyfile);

      if (hfp->ssi == NULL) continue;

      onefetch(go, ofp, name, hfp);
      nfetched++;
    }

  /* Pass 2 (no index only): scan the HMM file once, keeping the hits. */
  if (hfp->ssi == NULL)
    {
      for (;;)
        {
          status = p7_hmmfile_Read(hfp, &abc, &hmm);
          if (status == eslEOF) break;

          if (status != eslOK)
            {
              if      (status == eslEOD)       p7_Fail("read failed, HMM file %s may be truncated?", hfp->fname);
              else if (status == eslEFORMAT)   p7_Fail("bad file format in HMM file %s",             hfp->fname);
              else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets",   hfp->fname);
              else                             p7_Fail("Unexpected error in reading HMMs from %s",   hfp->fname);
            }

          /* Try the name first; fall back to the accession only if there is one. */
          hit = (esl_key_Lookup(wanted, hmm->name, &idx) == eslOK);
          if (! hit && hmm->acc)
            hit = (esl_key_Lookup(wanted, hmm->acc, &idx) == eslOK);

          if (hit)
            {
              p7_hmmfile_WriteASCII(ofp, -1, hmm);
              nfetched++;
            }

          p7_hmm_Destroy(hmm);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d HMMs.\n", nfetched);

  if (abc != NULL) esl_alphabet_Destroy(abc);
  esl_keyhash_Destroy(wanted);
  esl_fileparser_Close(parser);
  return;
}
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line ('#' starts a comment), storing each
 * key in a hash; a key that occurs twice is a fatal error;
 * if we have an SSI index available, retrieve the MSAs by key
 * as we see each line;
 * else, without an SSI index, read the entire MSA file in a single
 * pass, outputting MSAs whose name or accession is in our keylist.
 *
 * Note that with an SSI index, you get the MSAs in the order they
 * appear in the <keyfile>, but without an SSI index, you get MSAs in
 * the order they occur in the MSA file.
 *
 * When <ofp> is not stdout, the number of alignments actually
 * retrieved is reported on stdout at the end.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys  = esl_keyhash_Create();
  ESL_FILEPARSER *efp   = NULL;
  ESL_MSA        *msa   = NULL;
  int             nali  = 0;	/* number of alignments retrieved (written or fetched) */
  int             nread = 0;	/* number of alignments read from <afp>; for diagnostics only */
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);

      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);

      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp); nali++; }
    }

  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
        {
          if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
          nread++;

          if (msa->name == NULL)
            esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nread);

          if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
               (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
            {
              eslx_msafile_Write(ofp, msa, outfmt);
              nali++;		/* count only what we actually output, not everything we read */
            }

          esl_msa_Destroy(msa);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nali);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *keyfile = NULL; char *tabfile = NULL; ESL_KEYHASH *kh = esl_keyhash_Create(); int nkeys = 0; ESL_DMATRIX *D = NULL; ESL_TREE *T = NULL; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], go, "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], go, "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], go, "Incorrect number of command line arguments.\n"); keyfile = esl_opt_GetArg(go, 1); tabfile = esl_opt_GetArg(go, 2); read_keyfile(go, keyfile, kh); nkeys = esl_keyhash_GetNumber(kh); D = esl_dmatrix_Create(nkeys, nkeys); read_tabfile(go, tabfile, kh, D); esl_tree_SingleLinkage(D, &T); //esl_tree_WriteNewick(stdout, T); output_clusters(go, T, kh); esl_tree_Destroy(T); esl_dmatrix_Destroy(D); esl_keyhash_Destroy(kh); esl_getopts_Destroy(go); return 0; }
/* multifetch:
 * <keyfile> lists one sequence name or key per line ('#' starts a
 * comment). Each key is read and stored in a keyhash; a key that
 * appears twice is a fatal error.
 *
 * With an SSI index on <sqfp>, each sequence is fetched via onefetch()
 * as soon as its key is read, so sequences come out in <keyfile> order.
 *
 * Without an SSI index, the whole sequence file is then read in a
 * single pass, and every sequence whose name or accession is in the
 * keyhash is written to <ofp> in FASTA format (reverse complemented
 * first if the -r option is set); sequences then come out in sequence
 * file order.
 *
 * It is a fatal error if the number of sequences retrieved differs
 * from the number of keys read. When <ofp> is not stdout, a count of
 * retrieved sequences is printed to stdout at the end.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *wanted    = esl_keyhash_Create();
  ESL_FILEPARSER *parser    = NULL;
  int             nfetched  = 0;
  int             nwanted   = 0;
  char           *name;
  int             namelen;
  int             idx;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &parser) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(parser, '#');

  /* Pass 1: collect the keys, fetching immediately if we have an index. */
  while (esl_fileparser_NextLine(parser) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(parser, &name, &namelen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", parser->linenumber, keyfile);

      status = esl_keyhash_Store(wanted, name, namelen, &idx);
      if (status == eslEDUP)
        esl_fatal("seq key %s occurs more than once in file %s\n", name, keyfile);

      if (sqfp->data.ascii.ssi != NULL)
        {
          onefetch(go, ofp, name, sqfp);
          nfetched++;
        }
      nwanted++;
    }

  /* Pass 2 (no index only): nothing has been fetched yet, so scan the
   * sequence file once and keep the hits.
   */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();
      int     hit;

      for (;;)
        {
          status = esl_sqio_Read(sqfp, sq);
          if (status != eslOK) break;

          /* Try a nonempty name first; fall back to a nonempty accession. */
          hit = (sq->name[0] != '\0' && esl_keyhash_Lookup(wanted, sq->name, -1, NULL) == eslOK);
          if (! hit)
            hit = (sq->acc[0] != '\0' && esl_keyhash_Lookup(wanted, sq->acc, -1, NULL) == eslOK);

          if (hit)
            {
              if (esl_opt_GetBoolean(go, "-r") && esl_sq_ReverseComplement(sq) != eslOK)
                esl_fatal("Failed to reverse complement %s\n", sq->name);

              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nfetched++;
            }

          esl_sq_Reuse(sq);
        }

      /* The loop ends on the first non-OK status; only EOF is a normal end. */
      if (status == eslEFORMAT)
        esl_fatal("Parse failed (sequence file %s):\n%s\n", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)
        esl_fatal("Unexpected error %d reading sequence file %s", status, sqfp->filename);

      esl_sq_Destroy(sq);
    }

  if (nwanted != nfetched)
    esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nwanted, nfetched);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nfetched);

  esl_keyhash_Destroy(wanted);
  esl_fileparser_Close(parser);
  return;
}