Ejemplo n.º 1
0
/* multifetch:
 * Retrieve every MSA named in <keyfile> from the open MSA file <afp>
 * and write it to <ofp> in format <outfmt>.
 *
 * <keyfile> has one name or key per line; the first token on each
 * line is the key, and '#' starts a comment. A key that occurs more
 * than once is a fatal error.
 *
 * If an SSI index is available, each MSA is retrieved by key as soon
 * as its line is parsed, so MSAs come out in the order they appear
 * in <keyfile>.
 *
 * Without an SSI index, the keys are stored in a hash, then the
 * entire MSA file is read in a single pass, and every MSA whose name
 * or accession is in the hash is written. MSAs then come out in the
 * order they occur in the MSA file.
 *
 * If <ofp> is not stdout, the number of MSAs that were retrieved
 * (not merely read) is reported on stdout.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_MSA        *msa    = NULL;
  int             nread  = 0;   /* MSAs read in the non-SSI pass; numbers alignments in error messages */
  int             nali   = 0;   /* MSAs actually retrieved and written                                 */
  char           *key;
  int             keylen;
  int             status;

  if (keys == NULL) esl_fatal("Failed to allocate key hash\n");

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);

      /* Keys are stored even in SSI mode, so that duplicates are detected. */
      status = esl_keyhash_Store(keys, key, keylen, NULL);
      if      (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store MSA key %s from file %s\n", key, keyfile);

      /* With an SSI index, fetch as we go. */
      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp);  nali++; }
    }

  /* Without an SSI index nothing has been fetched yet: scan the whole MSA file. */
  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
        {
          if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
          nread++;

          if (msa->name == NULL)
            esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nread);

          /* An MSA matches if either its name or its (optional) accession is a key. */
          if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
               (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
            {
              if (eslx_msafile_Write(ofp, msa, outfmt) != eslOK)
                esl_fatal("Failed to write alignment %s\n", msa->name);
              nali++;
            }

          esl_msa_Destroy(msa);
        }
    }

  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nali);
  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
Ejemplo n.º 2
0
/* read_tabfile:
 * Parse the tabular file <tabfile> and fill the matrix <D> with the
 * values it contains.
 *
 * Each data line is split into tokens; '#' starts a comment. Fields
 * are numbered from 1. The fields holding the value, the query key
 * and the target key are given by the integer options "-v", "-q" and
 * "-t" of <go>. The query and target keys are looked up in <kh>, and
 * the resulting indices are used as the row and column of <D>.
 *
 * <D> is first set to eslINFINITY everywhere. For each line,
 * D[q][t] is set to the value, and D[t][q] is also set to it unless
 * D[t][q] has already received a value of its own.
 *
 * Any failure (unopenable file, unknown key, missing field, or a
 * value field that is not a number) is fatal. Line numbers in error
 * messages are line numbers in <tabfile>.
 */
static void
read_tabfile(ESL_GETOPTS *go, char *tabfile, ESL_KEYHASH *kh, ESL_DMATRIX *D)
{
  ESL_FILEPARSER *efp      = NULL;
  int             vfield   = esl_opt_GetInteger(go, "-v");
  int             qfield   = esl_opt_GetInteger(go, "-q");
  int             tfield   = esl_opt_GetInteger(go, "-t");
  char           *tok;
  char           *endp;
  int             toklen;
  int             ntok;
  double          value;
  int             qidx, tidx;

  if (esl_fileparser_Open(tabfile, NULL, &efp) != eslOK) esl_fatal("File open failed");
  esl_fileparser_SetCommentChar(efp, '#');

  esl_dmatrix_Set(D, eslINFINITY);

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      /* Sentinels: -1 / NaN mean "field not seen on this line". */
      ntok  = 0;
      qidx  = tidx = -1;
      value = eslNaN;

      while (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) == eslOK)
        {
          ntok++;
          if (ntok == vfield)
            {
              /* Require the whole token to be numeric; a bare atof() would turn garbage into 0.0. */
              value = strtod(tok, &endp);
              if (endp == tok || *endp != '\0')
                esl_fatal("Failed to parse value %s on line %d (field %d)\n", tok, efp->linenumber, vfield);
            }
          if (ntok == qfield && esl_keyhash_Lookup(kh, tok, toklen, &qidx) != eslOK) esl_fatal("failed to find query key %s", tok);
          if (ntok == tfield && esl_keyhash_Lookup(kh, tok, toklen, &tidx) != eslOK) esl_fatal("failed to find target key %s", tok);
        }

      if (qidx  == -1)  esl_fatal("Failed to find query name on line %d (looking for field %d)\n",  efp->linenumber, qfield);
      if (tidx  == -1)  esl_fatal("Failed to find target name on line %d (looking for field %d)\n", efp->linenumber, tfield);
      if (isnan(value)) esl_fatal("Failed to find value on line %d (looking for field %d)\n",       efp->linenumber, vfield);

      D->mx[qidx][tidx] = value;
      /* Mirror into the transposed cell only if it has no value of its own yet. */
      if (D->mx[tidx][qidx] == eslINFINITY) D->mx[tidx][qidx] = value;
    }

  esl_fileparser_Close(efp);
}
Ejemplo n.º 3
0
/* Function:  p7_tophits_CompareRanking()
 * Synopsis:  Compare current top hits to previous top hits ranking.
 *
 * Purpose:   Using a keyhash <kh> of the previous included top hits,
 *            look at the current top hits list <th> and flag hits
 *            that are included for the first time (by setting the
 *            <p7_IS_NEW> flag) and hits that were included
 *            previously but are now below the inclusion threshold
 *            in the list (by setting the <p7_IS_DROPPED> flag).
 *
 *            The <th> must already have been processed by
 *            <p7_tophits_Threshold()>. We assume the <is_included>,
 *            <is_reported> flags are set on the appropriate hits.
 *
 *            Upon return, the keyhash <kh> has been emptied and
 *            refilled with the names of the currently included
 *            hits, stored in the order they occur in <th>.
 *
 *            Optionally, <*opt_nnew> is set to the number of
 *            newly included hits. jackhmmer uses this as part of
 *            its convergence criteria, for example.
 *
 *            These flags affect output of top target hits from
 *            <p7_tophits_Targets()>.
 *
 *            It only makes sense to call this function in context of
 *            an iterative search.
 *
 *            The <p7_IS_NEW> flag is comprehensive: all new hits
 *            are flagged (and counted in <*opt_nnew>). The
 *            <p7_IS_DROPPED> flag is not comprehensive: only those
 *            hits that still appear in the current top hits list
 *            are flagged. If a hit dropped entirely off the list,
 *            it isn't counted as "dropped". (This could be done,
 *            but we would want to have two keyhashes, one old and
 *            one new, to do the necessary comparisons efficiently.)
 *
 *            If the target names in <th> are not unique, results may
 *            be strange.
 *
 * Args:      th         - current top hits list
 *            kh         - hash of included top hits (in: previous tophits; out: <th>'s tophits)
 *            opt_nnew   - optRETURN: number of new hits above inclusion threshold
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if <kh> needed to be reallocated but this failed.
 *            In that case <*opt_nnew> is set to 0; the flags in <th>
 *            have already been updated and <kh> is left partially
 *            refilled.
 */
int
p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew)
{
  int nnew = 0;
  int was_included;   /* TRUE if this hit's name is in the previous list <kh> */
  int h;
  int status;

  /* Flag the hits in the list with whether they're new in the included top hits,
   * and whether they've dropped off the included list.
   * Membership is decided from the lookup's return status, so we never
   * depend on the callee having written an output index on a miss.
   */
  for (h = 0; h < th->N; h++)
  {
    was_included = (esl_keyhash_Lookup(kh, th->hit[h]->name, -1, NULL) == eslOK);

    if (th->hit[h]->flags & p7_IS_INCLUDED)
    {
      if (! was_included) { th->hit[h]->flags |= p7_IS_NEW; nnew++; }
    }
    else
    {
      if (was_included) th->hit[h]->flags |= p7_IS_DROPPED;
    }
  }

  /* Replace the old rank list with the new one */
  esl_keyhash_Reuse(kh);
  for (h = 0; h < th->N; h++)
  {
    if (th->hit[h]->flags & p7_IS_INCLUDED)
    {
      /* A name that appears twice yields <eslEDUP> on its second store;
       * that is tolerated, and the name stays stored with its higher rank.
       */
      status = esl_keyhash_Store(kh, th->hit[h]->name, -1, NULL);
      if (status != eslOK && status != eslEDUP) goto ERROR;
    }
  }

  if (opt_nnew != NULL) *opt_nnew = nnew;
  return eslOK;

 ERROR:
  if (opt_nnew != NULL) *opt_nnew = 0;
  return status;
}
Ejemplo n.º 4
0
/* multifetch:
 * Retrieve every sequence named in <keyfile> from the open sequence
 * file <sqfp> and write it to <ofp>.
 *
 * <keyfile> has one name or key per line; the first token on each
 * line is the key, and '#' starts a comment. A key that occurs more
 * than once is a fatal error.
 *
 * If an SSI index is available, each sequence is retrieved by key as
 * soon as its line is parsed, so seqs come out in the order they
 * appear in <keyfile>.
 *
 * Without an SSI index, the keys are stored in a hash, then the
 * entire seq file is read in a single pass, and every seq whose name
 * or accession is in the hash is written in FASTA format (reverse
 * complemented first if the "-r" option of <go> is set). Seqs then
 * come out in the order they occur in the seq file.
 *
 * It is a fatal error if the number of sequences retrieved differs
 * from the number of keys. If <ofp> is not stdout, the number of
 * sequences retrieved is reported on stdout.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  int             nseq   = 0;   /* sequences retrieved      */
  int             nkeys  = 0;   /* keys read from <keyfile> */
  char           *key;
  int             keylen;
  int             status;

  if (keys == NULL) esl_fatal("Failed to allocate key hash\n");

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)  esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
        esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);

      /* Keys are stored even in SSI mode, so that duplicates are detected. */
      status = esl_keyhash_Store(keys, key, keylen, NULL);
      if      (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
      else if (status != eslOK)   esl_fatal("Failed to store seq key %s from file %s\n", key, keyfile);

      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp);  nseq++; }
      nkeys++;
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL)
    {
      ESL_SQ *sq = esl_sq_Create();

      if (sq == NULL) esl_fatal("Failed to allocate sequence\n");

      while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
        {
          /* A seq matches if either its (nonempty) name or its (nonempty) accession is a key. */
          if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
               (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
            {
              if (esl_opt_GetBoolean(go, "-r"))
                {
                  if (esl_sq_ReverseComplement(sq) != eslOK)
                    esl_fatal("Failed to reverse complement %s\n", sq->name);
                }
              if (esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE) != eslOK)
                esl_fatal("Failed to write sequence %s\n", sq->name);
              nseq++;
            }
          esl_sq_Reuse(sq);
        }

      /* The read loop ends on anything but eslOK; only eslEOF is a normal end. */
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
                                               sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
                                               status, sqfp->filename);
      esl_sq_Destroy(sq);
    }

  if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}