/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the MSAs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire MSA file in a single pass, outputting MSAs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the MSAs in the order they
 * appear in the <keyfile>, but without an SSI index, you get MSAs in
 * the order they occur in the MSA file.
 *
 * Args:    go      - application options; only passed through to onefetch()
 *          ofp     - open stream that retrieved MSAs are written to
 *          outfmt  - output format code handed to the MSA writer
 *          keyfile - name of the key file; '#' starts a comment
 *          afp     - open MSA file; a non-NULL <afp->ssi> selects
 *                    indexed retrieval
 *
 * Returns: (void). Key file problems (cannot open, no token on a line,
 *          duplicate key, failure to store a key) and unnamed
 *          alignments are reported through esl_fatal(); read errors
 *          are handed to eslx_msafile_ReadFailure().
 *
 * When <ofp> is not stdout, a summary line giving the number of
 * alignments that were retrieved is printed to stdout.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys     = esl_keyhash_Create();
  ESL_FILEPARSER *efp      = NULL;
  ESL_MSA        *msa      = NULL;
  int             nread    = 0;   /* alignments read during the sequential scan   */
  int             nfetched = 0;   /* alignments actually retrieved (either mode)  */
  char           *key;
  int             keylen;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);

      /* The key's index in the hash is not needed, so don't ask for it.
       * Any failure to store is fatal: a key that is silently missing
       * from the hash would never be retrieved in the scan below.
       */
      status = esl_keyhash_Store(keys, key, keylen, NULL);
      if      (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store MSA key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp); nfetched++; }
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
        {
          if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
          nread++;

          if (msa->name == NULL)
            esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nread);

          /* A key may match either the alignment's name or its accession. */
          if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
               (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
            {
              eslx_msafile_Write(ofp, msa, outfmt);
              nfetched++;   /* count only what was written, not everything read */
            }

          esl_msa_Destroy(msa);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nfetched);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
/* Function:  p7_tophits_CompareRanking()
 * Synopsis:  Compare current top hits to previous top hits ranking.
 *
 * Purpose:   Using a keyhash <kh> of the previous top hits and
 *            their ranks, look at the current top hits list <th>
 *            and flag hits that are included for the first time
 *            (by setting the <p7_IS_NEW> flag) and hits that were
 *            included previously but are now below the inclusion
 *            threshold in the list (by setting the <p7_IS_DROPPED>
 *            flag).
 *
 *            The <th> must already have been processed by
 *            <p7_tophits_Threshold()>. We assume the <is_included>,
 *            <is_reported> flags are set on the appropriate hits.
 *
 *            Upon return, the keyhash <kh> is updated to hash the
 *            current top hits list and their ranks.
 *
 *            Optionally, <*opt_nnew> is set to the number of
 *            newly included hits. jackhmmer uses this as part of
 *            its convergence criteria, for example.
 *
 *            These flags affect output of top target hits from
 *            <p7_tophits_Targets()>.
 *
 *            It only makes sense to call this function in the
 *            context of an iterative search.
 *
 *            The <p7_IS_NEW> flag is comprehensive: all new hits
 *            are flagged (and counted in <*opt_nnew>). The
 *            <p7_IS_DROPPED> flag is not comprehensive: only those
 *            hits that still appear in the current top hits list are
 *            flagged. If a hit dropped entirely off the list, it
 *            isn't counted as "dropped". (This could be done, but we
 *            would want to have two keyhashes, one old and one new,
 *            to do the necessary comparisons efficiently.)
 *
 *            If the target names in <th> are not unique, results may
 *            be strange.
 *
 * Args:      th       - current top hits list
 *            kh       - hash of top hits' ranks (in: previous tophits; out: <th>'s tophits)
 *            opt_nnew - optRETURN: number of new hits above inclusion threshold
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if <kh> needed to be reallocated but this failed.
 *            On any such failure, <*opt_nnew> (if requested) is set to 0.
 */
int
p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew)
{
  int n_new = 0;
  int prev_rank;
  int is_included;
  int i;
  int status;

  /* Pass 1: compare each current hit against the previous round's ranking.
   * The lookup leaves <prev_rank> at -1 for a name that was not in <kh>.
   */
  for (i = 0; i < th->N; i++)
    {
      esl_keyhash_Lookup(kh, th->hit[i]->name, -1, &prev_rank);
      is_included = (th->hit[i]->flags & p7_IS_INCLUDED) != 0;

      if (is_included && prev_rank == -1)
        {                       /* included now, unknown before: new */
          th->hit[i]->flags |= p7_IS_NEW;
          n_new++;
        }
      else if (! is_included && prev_rank >= 0)
        {                       /* known before, not included now: dropped */
          th->hit[i]->flags |= p7_IS_DROPPED;
        }
    }

  /* Pass 2: empty the hash, then refill it with the currently included hits. */
  esl_keyhash_Reuse(kh);
  for (i = 0; i < th->N; i++)
    {
      if (! (th->hit[i]->flags & p7_IS_INCLUDED)) continue;

      /* A repeated target name yields <eslEDUP>, which is tolerated here. */
      status = esl_keyhash_Store(kh, th->hit[i]->name, -1, NULL);
      if (status != eslOK && status != eslEDUP)
        {
          if (opt_nnew != NULL) *opt_nnew = 0;
          return status;
        }
    }

  if (opt_nnew != NULL) *opt_nnew = n_new;
  return eslOK;
}
/* Function:  esl_msashuffle_PermuteSequenceOrder()
 * Synopsis:  Permutes the order of the sequences.
 *
 * Purpose:   Randomly permute the order of the sequences in <msa>,
 *            and any associated sequence annotation, in place.
 *
 *            The per-sequence fields that move together are: the
 *            aligned sequence itself (<aseq> for a text-mode MSA;
 *            <ax> for a digital one, when compiled with
 *            <eslAUGMENT_ALPHABET>), <sqname>, <wgt>, and, where
 *            present, <sqacc>, <sqdesc>, <ss>, <sa>, <pp>, <sqlen>,
 *            <sslen>, <salen>, <pplen>, and each <gs>/<gr> tag.
 *
 * Args:      r    - source of random numbers
 *            msa  - alignment to permute, in place
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */

/* Exchange elements <a> and <b> of array <arr>, using <scratch> as temporary storage. */
#define MSASHUFFLE_SWAP(arr, a, b, scratch) \
  do { (scratch) = (arr)[(a)]; (arr)[(a)] = (arr)[(b)]; (arr)[(b)] = (scratch); } while (0)

int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
{
  void   *ptr_tmp;     /* scratch for pointer-valued fields */
  double  wgt_tmp;     /* scratch for sequence weights      */
  int64_t len_tmp;     /* scratch for length fields         */
  int     remaining;   /* number of slots not yet finalized */
  int     pick;        /* randomly chosen slot, 0..remaining-1 */
  int     last;        /* slot being finalized: remaining-1 */
  int     tag;
  int     idx;

  /* Work from the back: choose one of the first <remaining> sequences at
   * random and move it into the last unfinalized slot.
   */
  remaining = msa->nseq;
  while (remaining > 1)
    {
      pick = esl_rnd_Roll(r, remaining);	/* pick = 0..remaining-1 */
      last = remaining - 1;

      if ( ! (msa->flags & eslMSA_DIGITAL))
        {
          MSASHUFFLE_SWAP(msa->aseq, pick, last, ptr_tmp);
        }
#ifdef eslAUGMENT_ALPHABET
      else
        {
          MSASHUFFLE_SWAP(msa->ax, pick, last, ptr_tmp);
        }
#endif

      /* Fields swapped unconditionally. */
      MSASHUFFLE_SWAP(msa->sqname, pick, last, ptr_tmp);
      MSASHUFFLE_SWAP(msa->wgt,    pick, last, wgt_tmp);

      /* Optional per-sequence annotation. */
      if (msa->sqacc)  MSASHUFFLE_SWAP(msa->sqacc,  pick, last, ptr_tmp);
      if (msa->sqdesc) MSASHUFFLE_SWAP(msa->sqdesc, pick, last, ptr_tmp);
      if (msa->ss)     MSASHUFFLE_SWAP(msa->ss,     pick, last, ptr_tmp);
      if (msa->sa)     MSASHUFFLE_SWAP(msa->sa,     pick, last, ptr_tmp);
      if (msa->pp)     MSASHUFFLE_SWAP(msa->pp,     pick, last, ptr_tmp);

      /* Optional per-sequence lengths. */
      if (msa->sqlen)  MSASHUFFLE_SWAP(msa->sqlen,  pick, last, len_tmp);
      if (msa->sslen)  MSASHUFFLE_SWAP(msa->sslen,  pick, last, len_tmp);
      if (msa->salen)  MSASHUFFLE_SWAP(msa->salen,  pick, last, len_tmp);
      if (msa->pplen)  MSASHUFFLE_SWAP(msa->pplen,  pick, last, len_tmp);

      /* Tagged annotation: skip tags that have no array. */
      for (tag = 0; tag < msa->ngs; tag++)
        {
          if (msa->gs[tag] == NULL) continue;
          MSASHUFFLE_SWAP(msa->gs[tag], pick, last, ptr_tmp);
        }
      for (tag = 0; tag < msa->ngr; tag++)
        {
          if (msa->gr[tag] == NULL) continue;
          MSASHUFFLE_SWAP(msa->gr[tag], pick, last, ptr_tmp);
        }

      remaining--;
    }

  /* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
  if (msa->index)
    {
      esl_keyhash_Reuse(msa->index);
      for (idx = 0; idx < msa->nseq; idx++)
        esl_keyhash_Store(msa->index, msa->sqname[idx], -1, NULL);
    }
  return eslOK;
}

#undef MSASHUFFLE_SWAP
/* read_keyfile()
 * Read keys from <keyfile> into the keyhash <kh>, taking the first
 * token on each data line as one key. Lines starting with '#' are
 * treated as comments by the file parser.
 *
 * Args:    go      - application options (not used in this function)
 *          keyfile - name of the key file to read
 *          kh      - keyhash that the keys are stored into
 *
 * Returns: (void). Every failure is fatal via esl_fatal(): the file
 *          cannot be opened, a line yields no token, a key is seen
 *          twice, or a key cannot be stored.
 */
static void
read_keyfile(ESL_GETOPTS *go, char *keyfile, ESL_KEYHASH *kh)
{
  ESL_FILEPARSER *efp    = NULL;
  char           *tok    = NULL;
  int             toklen;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("File open failed");
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      /* Report the parser's own line number rather than a count of the
       * lines returned to us, so that the message points at the right
       * line of the file even when earlier lines were skipped.
       */
      if (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) != eslOK)
        esl_fatal("No token found on line %d", efp->linenumber);

      status = esl_keyhash_Store(kh, tok, toklen, NULL);
      if      (status == eslEDUP) esl_fatal("Saw key %s twice: keys must be unique", tok);
      else if (status != eslOK)   esl_fatal("unknown error in storing key %s\n", tok);
    }

  esl_fileparser_Close(efp);
}
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the seqs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire seq file in a single pass, outputting seqs
 * that are in our keylist.
 *
 * Note that with an SSI index, you get the seqs in the order they
 * appear in the <keyfile>, but without an SSI index, you get seqs in
 * the order they occur in the seq file.
 *
 * Args:    go      - application options; "-r" requests reverse
 *                    complementing in the unindexed scan, and <go> is
 *                    also passed through to onefetch()
 *          ofp     - open stream that retrieved sequences are written to
 *          keyfile - name of the key file; '#' starts a comment
 *          sqfp    - open sequence file; a non-NULL
 *                    <sqfp->data.ascii.ssi> selects indexed retrieval
 *
 * Returns: (void). Failures are fatal via esl_fatal(), including the
 *          case where the number of sequences retrieved differs from
 *          the number of keys.
 *
 * When <ofp> is not stdout, a summary line with the number of
 * sequences retrieved is printed to stdout.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  int             nseq   = 0;   /* sequences retrieved         */
  int             nkeys  = 0;   /* keys read from the key file */
  char           *key;
  int             keylen;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      /* The key's index in the hash is not needed, so don't ask for it.
       * Any failure to store is fatal here: otherwise the key would be
       * counted in <nkeys> but could never match in the scan below, and
       * the problem would only show up as a count mismatch at the end.
       */
      status = esl_keyhash_Store(keys, key, keylen, NULL);
      if      (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store seq key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp); nseq++; }
      nkeys++;
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();

      while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
        {
          /* A key may match either the sequence's name or its accession;
           * empty names/accessions are never looked up.
           */
          if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
               (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
            {
              if (esl_opt_GetBoolean(go, "-r") )
                if (esl_sq_ReverseComplement(sq) != eslOK)
                  esl_fatal("Failed to reverse complement %s\n", sq->name);
              esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
              nseq++;
            }
          esl_sq_Reuse(sq);
        }
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
                                               sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
                                               status, sqfp->filename);

      esl_sq_Destroy(sq);
    }

  if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}