Beispiel #1
0
/* read_mask_file
 *
 * Given an open file pointer, read the first token of the
 * file and return it as *ret_mask. It must contain only
 * '0' or '1' characters.
 *
 * Returns:  eslOK on success.
 */
int
read_mask_file(char *filename, char *errbuf, char **ret_mask, int *ret_mask_len)
{
  ESL_FILEPARSER *efp    = NULL;
  char           *mask   = NULL;
  char           *tok;
  int             toklen;
  int             n;
  int             status;


  if (esl_fileparser_Open(filename, NULL, &efp) != eslOK) ESL_XFAIL(eslFAIL, errbuf, "failed to open %s in read_mask_file\n", filename);
  esl_fileparser_SetCommentChar(efp, '#');
  
  if((status = esl_fileparser_GetToken(efp, &tok, &toklen)) != eslOK) ESL_XFAIL(eslFAIL, errbuf, "failed to read a single token from %s\n", filename);

  ESL_ALLOC(mask, sizeof(char) * (toklen+1));
  for(n = 0; n < toklen; n++) { 
    if((tok[n] == '0') || (tok[n] == '1')) { 
      mask[n] = tok[n];
    }
    else { ESL_XFAIL(eslFAIL, errbuf, "read a non-0 and non-1 character (%c) in the mask file %s\n", tok[n], filename); }
  }
  mask[n] = '\0';

  *ret_mask     = mask;
  *ret_mask_len = n;
  esl_fileparser_Close(efp);
  return eslOK;
  
 ERROR:
  if (efp)  esl_fileparser_Close(efp);
  if (mask) free(mask);
  return status;
}
Beispiel #2
0
/* Function:  esl_hyperexp_Read()
 *
 * Purpose:   Reads hyperexponential parameters from an open <e>.
 *            which is an <ESL_FILEPARSER> tokenizer for an open stream.
 *            
 *            The first token is <K>, the number of mixture components.
 *            The second token is <mu>, the x offset shared by all components.
 *            Then for each mixture component <k=1..K>, it reads
 *            a mixture coefficient <q[k]> and a decay parameter
 *            <lambda[k]>.
 *            
 *            The <2K+2> data tokens must occur in this order, but
 *            they can be grouped into any number of lines, because the
 *            parser ignores line breaks.
 *            
 *            Anything after a <\#> character on a line is a comment, and
 *            is ignored.
 *            
 * Returns:   <eslOK> on success, and <ret_hxp> points to a new <ESL_HYPEREXP>
 *            object.
 *            <eslEFORMAT> on "normal" parse failure caused by a bad file 
 *            format that's likely the user's fault.
 *
 * Throws:    <eslEMEM> if allocation of the new <ESL_HYPEREXP> fails.
 *
 * 
 * FIXME: All our mixture models (esl_dirichlet, for example) should be
 *        reconciled w/ identical interfaces & behaviour.
 */
int
esl_hyperexp_Read(ESL_FILEPARSER *e, ESL_HYPEREXP **ret_hxp)
{
  ESL_HYPEREXP   *hxp = NULL;
  char           *tok;
  int             status = eslOK;
  int             nc;
  int             k;
  double          sum;

  esl_fileparser_SetCommentChar(e, '#');

  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  nc = atoi(tok);
  if (nc < 1) {  
    sprintf(e->errbuf, "Expected # of components K >= 1 as first token");
    goto ERROR;
  }

  if ((hxp = esl_hyperexp_Create(nc)) == NULL) return eslEMEM; /* percolation */
  
  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  hxp->mu = atof(tok);

  for (k = 0; k < hxp->K; k++)
    {
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->q[k] = atof(tok);
      
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->lambda[k] = atof(tok);

      if (hxp->q[k] < 0. || hxp->q[k] > 1.) {
	sprintf(e->errbuf, "Expected a mixture coefficient q[k], 0<=q[k]<=1");
	goto ERROR;
      }
      if (hxp->lambda[k] <= 0.) {
	sprintf(e->errbuf, "Expected a lambda parameter, lambda>0");
	goto ERROR;
      }
    }
  sum = esl_vec_DSum(hxp->q, hxp->K);
  if (fabs(sum-1.0) > 0.05) {
    sprintf(e->errbuf, "Expected mixture coefficients to sum to 1");
    goto ERROR;
  }
  esl_vec_DNorm(hxp->q, hxp->K);
  *ret_hxp = hxp;
  return eslOK;

 ERROR:
  esl_hyperexp_Destroy(hxp); 
  return eslEFORMAT;
}
Beispiel #3
0
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the HMMs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire HMM file in a single pass, outputting HMMs
 * that are in our keylist. 
 * 
 * Note that with an SSI index, you get the HMMs in the order they
 * appear in the <keyfile>, but without an SSI index, you get HMMs in
 * the order they occur in the HMM file.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, P7_HMMFILE *hfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_ALPHABET   *abc    = NULL;
  P7_HMM         *hmm    = NULL;
  int             nhmm   = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;
  
  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)  p7_Fail("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
	p7_Fail("Failed to read HMM name on line %d of file %s\n", efp->linenumber, keyfile);
      
      status = esl_key_Store(keys, key, &keyidx);
      if (status == eslEDUP) p7_Fail("HMM key %s occurs more than once in file %s\n", key, keyfile);
	
      if (hfp->ssi != NULL) { onefetch(go, ofp, key, hfp);  nhmm++; }
    }

  if (hfp->ssi == NULL) 
    {
      while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
	{
	  if      (status == eslEOD)       p7_Fail("read failed, HMM file %s may be truncated?", hfp->fname);
	  else if (status == eslEFORMAT)   p7_Fail("bad file format in HMM file %s",             hfp->fname);
	  else if (status == eslEINCOMPAT) p7_Fail("HMM file %s contains different alphabets",   hfp->fname);
	  else if (status != eslOK)        p7_Fail("Unexpected error in reading HMMs from %s",   hfp->fname);

	  if (esl_key_Lookup(keys, hmm->name, &keyidx) == eslOK || 
	      ((hmm->acc) && esl_key_Lookup(keys, hmm->acc, &keyidx) == eslOK))
	    {
	      p7_hmmfile_WriteASCII(ofp, -1, hmm);
	      nhmm++;
	    }

	  p7_hmm_Destroy(hmm);
	}
    }
  
  if (ofp != stdout) printf("\nRetrieved %d HMMs.\n", nhmm);
  if (abc != NULL) esl_alphabet_Destroy(abc);
  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
Beispiel #4
0
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the MSAs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire MSA file in a single pass, outputting MSAs
 * that are in our keylist. 
 * 
 * Note that with an SSI index, you get the MSAs in the order they
 * appear in the <keyfile>, but without an SSI index, you get MSAs in
 * the order they occur in the MSA file.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, int outfmt, char *keyfile, ESLX_MSAFILE *afp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  ESL_MSA        *msa    = NULL;
  int             nali   = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;
  
  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) 
    esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
	esl_fatal("Failed to read MSA name on line %d of file %s\n", efp->linenumber, keyfile);
      
      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if (status == eslEDUP) esl_fatal("MSA key %s occurs more than once in file %s\n", key, keyfile);
	
      if (afp->ssi) { onefetch(go, ofp, outfmt, key, afp);  nali++; }

    }

  if (! afp->ssi)
    {
      while ((status = eslx_msafile_Read(afp, &msa)) != eslEOF)
	{
	  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);
	  nali++;

	  if (msa->name == NULL) 
	    esl_fatal("Every alignment in file must have a name to be retrievable. Failed to find name of alignment #%d\n", nali);

	  if ( (esl_keyhash_Lookup(keys, msa->name, -1, NULL) == eslOK) ||
	       (msa->acc != NULL && esl_keyhash_Lookup(keys, msa->acc, -1, NULL) == eslOK))
	    eslx_msafile_Write(ofp, msa, outfmt);

	  esl_msa_Destroy(msa);
	}
    }
  
  if (ofp != stdout) printf("\nRetrieved %d alignments.\n", nali);
  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}
Beispiel #5
0
/* Function:  esl_paml_ReadE()
 * Incept:    SRE, Fri Jul  9 09:27:24 2004 [St. Louis]
 *
 * Purpose:   Read an amino acid rate matrix in PAML format from stream
 *            <fp>. Return it in two pieces: the symmetric E
 *            exchangeability matrix in <E>, and the stationary
 *            probability vector $\pi$ in <pi>.
 *            Caller provides the memory for both <E> and <pi>.  <E>
 *            is a $20 \times 20$ matrix allocated as
 *            <esl_dmatrix_Create(20, 20)>. <pi> is an array with
 *            space for at least 20 doubles.
 *            
 *            The <E> matrix is symmetric for off-diagonal elements:
 *            $E_{ij} = E_{ij}$ for $i \neq j$.  The on-diagonal
 *            elements $E_{ii}$ are not valid and should not be
 *            accessed.  (They are set to zero.)

 *            The rate matrix will later be obtained from <E>
 *            and <pi> as 
 *                $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$ 
 *            and
 *                $Q_{ii} = -\sum_{j \neq i} Q_{ij}$ 
 *            then scaled to units of one
 *            substitution/site; see <esl_ratemx_E2Q()> and
 *            <esl_ratemx_ScaleTo()>.
 *
 *            Data file format: First 190 numbers are a
 *            lower-triangular matrix E of amino acid
 *            exchangeabilities $E_{ij}$. Next 20 numbers are the
 *            amino acid frequencies $\pi_i$. Remainder of the
 *            datafile is ignored.
 *            
 *            The alphabet order in the matrix and the frequency
 *            vector is assumed to be "ARNDCQEGHILKMFPSTWYV"
 *            (alphabetical by three-letter code), which appears to be
 *            PAML's default order. This is transformed to Easel's
 *            "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code)
 *            in the $E_{ij}$ and $\pi_i$ that are returned.
 *            
 * Args:      fp   - open datafile for reading.
 *            E    - RETURN: E matrix of amino acid exchangeabilities e_ij,
 *                     symmetric (E_ij = E_ji),
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *            pi   - RETURN: \pi_i vector of amino acid frequencies,
 *                    in Easel amino acid alphabet order A..Y.
 *                    Caller provides appropriately allocated space.
 *
 * Returns:   <eslOK> on success.
 *            Returns <eslEOF> on premature end of file (parse failed), in which
 *            case the contents of <E> and <pi> are undefined.
 *            
 * Throws:    <eslEMEM> on internal allocation failure,
 *            and the contents of <E> and <pi> are undefined.
 *
 * Xref:      STL8/p.56.
 */
int
esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi)
{
  int             status;
  ESL_FILEPARSER *efp = NULL;
  char           *tok;
  int             i,j;
  char           *pamlorder = "ARNDCQEGHILKMFPSTWYV";
  char           *eslorder  = "ACDEFGHIKLMNPQRSTVWY";
  int             perm[20];

  if ((status =  esl_dmatrix_SetZero(E))                 != eslOK) goto ERROR;
  esl_vec_DSet(pi, 20, 0.);

  if ((efp =    esl_fileparser_Create(fp))               == NULL)  goto ERROR;
  if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR;

  /* Construct the alphabet permutation we need.
   * perm[i] -> original row/column i goes to row/column perm[i]
   */
   for (i = 0; i < 20; i++)
     perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder);

   /* Read the s_ij matrix data in, permuting as we go. */

   for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
	E->mx[perm[i]][perm[j]] = atof(tok);
	E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]];
      }

   /* Read the pi_i vector in, permuting as we read. */
  for (i = 0; i < 20; i++)
    {
      if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
      pi[perm[i]] = atof(tok);
    }

  esl_fileparser_Destroy(efp);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Destroy(efp);
  return status;
}
Beispiel #6
0
static void
read_tabfile(ESL_GETOPTS *go, char *tabfile, ESL_KEYHASH *kh, ESL_DMATRIX *D)
{
  ESL_FILEPARSER *efp      = NULL;
  int             nline    = 0;
  int             vfield   = esl_opt_GetInteger(go, "-v");
  int             qfield   = esl_opt_GetInteger(go, "-q");
  int             tfield   = esl_opt_GetInteger(go, "-t");
  char           *tok;
  int             toklen;
  int             ntok;
  double          value;
  int             qidx, tidx;
  
  if (esl_fileparser_Open(tabfile, NULL, &efp) != eslOK) esl_fatal("File open failed");
  esl_fileparser_SetCommentChar(efp, '#');

  esl_dmatrix_Set(D, eslINFINITY);

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      nline++;
      ntok  = 0;
      qidx  = tidx = -1;
      value = eslNaN;
      while (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) == eslOK)
	{
	  ntok++;
	  if (ntok == vfield)  value = atof(tok);
	  if (ntok == qfield && esl_keyhash_Lookup(kh, tok, toklen, &qidx) != eslOK) esl_fatal("failed to find query key %s", tok);
	  if (ntok == tfield && esl_keyhash_Lookup(kh, tok, toklen, &tidx) != eslOK) esl_fatal("failed to find target key %s", tok);
	}
      if (qidx  == -1)  esl_fatal("Failed to find query name on line %d (looking for field %d)\n",  nline, qfield);
      if (tidx  == -1)  esl_fatal("Failed to find target name on line %d (looking for field %d)\n", nline, tfield);
      if (isnan(value)) esl_fatal("Failed to find value on line %d (looking for field %d)\n",       nline, vfield);

      D->mx[qidx][tidx] = value;
      if (D->mx[tidx][qidx] == eslINFINITY) D->mx[tidx][qidx] = value;
    }

  esl_fileparser_Close(efp);
}
Beispiel #7
0
/* Function:  esl_hyperexp_ReadFile()
 *
 * Purpose:   Convenience wrapper around <esl_hyperexp_Read()> that takes
 *            a filename as an argument, instead of an open <ESL_FILEPARSER>.
 *            
 *            This lets you quickly read an object from a file, but it
 *            limits your ability to deal gracefully and flexibly with
 *            'normal' errors like 'file not found' or 'bad file format'.
 *            Here, all errors are fatal.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on an allocation failure.
 *            
 *            <eslEFORMAT> on any parse error. Diagnostic information is
 *            unavailable, because the <ESL_FILEPARSER> that's holding 
 *            that information is internal to this function. 
 *            
 *            <eslENOTFOUND> on any failure to open the file.
 */
int
esl_hyperexp_ReadFile(char *filename, ESL_HYPEREXP **ret_hxp)
{
  FILE           *fp;
  ESL_FILEPARSER *e;
  int             status;

  if ((fp = fopen(filename, "r")) == NULL) 
    ESL_EXCEPTION(eslENOTFOUND, "file not found");

  if ((e = esl_fileparser_Create(fp)) == NULL) {
    fclose(fp);
    ESL_EXCEPTION(eslEMEM, "failed to create fileparser");
  }
  esl_fileparser_SetCommentChar(e, '#');

  status = esl_hyperexp_Read(e, ret_hxp);

  esl_fileparser_Destroy(e);
  fclose(fp);
  return status;
}
Beispiel #8
0
static void
multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *gdffile, ESL_SQFILE *sqfp)
{
  ESL_FILEPARSER *efp    = NULL;
  char           *newname;
  char           *s;
  int             n1, n2;
  int             start, end;
  char           *source;
 
  if (esl_fileparser_Open(gdffile, NULL, &efp) != eslOK)  esl_fatal("Failed to open key file %s\n", gdffile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &newname, &n1) != eslOK)
	esl_fatal("Failed to read subseq name on line %d of file %s\n", efp->linenumber, gdffile);

      if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
	esl_fatal("Failed to read start coord on line %d of file %s\n", efp->linenumber, gdffile);
      start = atoi(s);
      if(start <= 0) 
	esl_fatal("Read invalid start coord %d on line %d of file %s (must be positive integer)\n", start, efp->linenumber, gdffile);

      if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
	esl_fatal("Failed to read end coord on line %d of file %s\n", efp->linenumber, gdffile);
      end   = atoi(s);
      if(end < 0)
	esl_fatal("Read invalid end coord %d on line %d of file %s (must be positive integer, or 0 for full length)\n", end, efp->linenumber, gdffile);

      if (esl_fileparser_GetTokenOnLine(efp, &source, &n2) != eslOK)
	esl_fatal("Failed to read source seq name on line %d of file %s\n", efp->linenumber, gdffile);

      onefetch_subseq(go, ofp, sqfp, newname, source, start, end);
    }
  esl_fileparser_Close(efp);
}
Beispiel #9
0
static void 
read_keyfile(ESL_GETOPTS *go, char *keyfile, ESL_KEYHASH *kh)
{
  ESL_FILEPARSER *efp      = NULL;
  int             nline    = 0;
  char           *tok      = NULL;
  int             toklen;
  int             status;

  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("File open failed");
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      nline++;
      if (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) != eslOK) esl_fatal("No token found on line %d", nline);

      status = esl_keyhash_Store(kh, tok, toklen, NULL);
      if      (status == eslEDUP) esl_fatal("Saw key %s twice: keys must be unique", tok);
      else if (status != eslOK)   esl_fatal("unknown error in storing key %s\n", tok);
    }

  esl_fileparser_Close(efp);
}
Beispiel #10
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = NULL;	                /* application configuration       */
  char           *seqfile  = NULL;	                /* sequence file name              */
  char           *maskfile = NULL;	                /* mask coordinate file name       */
  int             infmt    = eslSQFILE_UNKNOWN;         /* format code for seqfile         */
  int             outfmt   = eslSQFILE_FASTA;           /* format code for output seqs     */
  ESL_SQFILE     *sqfp     = NULL;                      /* open sequence file              */
  ESL_FILEPARSER *maskefp  = NULL;	                /* open mask coord file            */
  FILE           *ofp      = NULL;	                /* output stream for masked seqs   */
  char           *source   = NULL;			/* name of current seq to mask     */
  char           *p1, *p2;				/* pointers used in parsing        */
  int64_t         start, end;				/* start, end coord for masking    */
  int64_t         i, j, pos;				/* coords in a sequence            */
  int64_t         overmask;				/* # of extra residues to mask     */
  ESL_SQ         *sq       = esl_sq_Create();		/* current sequence                */
  int             do_fetching;
  int             do_lowercase;
  int             maskchar;
  int             status;		                /* easel return code               */


  /****************************************************************************
   * Parse command line
   ****************************************************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");        

  do_fetching  = esl_opt_GetBoolean(go, "-R");
  do_lowercase = esl_opt_GetBoolean(go, "-l");
  overmask     = (esl_opt_IsOn(go, "-x") ? esl_opt_GetInteger(go, "-x") : 0);
  maskchar     = (esl_opt_IsOn(go, "-m") ? esl_opt_GetChar(go, "-m")    : 'X');

  seqfile  = esl_opt_GetArg(go, 1);
  maskfile = esl_opt_GetArg(go, 2);

  /* Open the <seqfile>: text mode, not digital */
  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
    if (infmt == eslSQFILE_UNKNOWN) cmdline_failure(argv[0], "%s is not a valid input sequence file format for --informat"); 
  }
  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n",     seqfile);
  else if (status == eslEFORMAT)   cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
  else if (status == eslEINVAL)    cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
  else if (status != eslOK)        cmdline_failure(argv[0], "Open failed, code %d.\n", status);

  if(do_fetching)
  {
    status = esl_sqfile_OpenSSI(sqfp, NULL);
    if      (status == eslEFORMAT)   cmdline_failure(argv[0], "SSI index is in incorrect format\n");
    else if (status == eslERANGE)    cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
    else if (status != eslOK)        cmdline_failure(argv[0], "Failed to open SSI index\n");
  }

  /* Open the <maskfile> */
  if (esl_fileparser_Open(maskfile, NULL, &maskefp) != eslOK) 
    cmdline_failure(argv[0], "Failed to open mask coordinate file %s\n", maskfile);
  esl_fileparser_SetCommentChar(maskefp, '#');

  /* Open the output file, if any */
  if (esl_opt_GetString(go, "-o") != NULL)
    {
      if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
	cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
    }
  else ofp = stdout;

  
  /****************************************************************************
   * Main loop over lines in <maskfile>
   ****************************************************************************/

  /* Read one data line at a time from the <maskfile>; 
   * parse into data fields <seqname> <start> <end> 
   */
  while (esl_fileparser_NextLine(maskefp) == eslOK)
    {
      /* First field is sequence name */
      if (esl_fileparser_GetTokenOnLine(maskefp, &source,  NULL) != eslOK)
	esl_fatal("Failed to read source seq name on line %d of file %s\n", maskefp->linenumber, maskfile);

      /* Get the sequence */
      if (do_fetching)
	{  /* If the <seqfile> is SSI indexed, try to reposition it and read <source> seq by random access */
	  status = esl_sqio_Fetch(sqfp, source, sq);
	  if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", source, sqfp->filename);
	  else if (status == eslEINVAL)    esl_fatal("No SSI index or can't reposition in file %s\n", sqfp->filename);
	  else if (status == eslEFORMAT)   esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));     
	  else if (status != eslOK)        esl_fatal("Unexpected failure in fetching %s from file %s\n", source, sqfp->filename);
	}
      else 
	{ /* else, assume we're reading sequentially; <sqfile> and <maskfile> have seqs in same order */
	  status = esl_sqio_Read(sqfp, sq);
	  if      (status == eslEOF)      esl_fatal("File %s ended prematurely; didn't find %s\n", sqfp->filename, source);
	  else if (status == eslEFORMAT)  esl_fatal("Parse failed:\n%s\n", esl_sqfile_GetErrorBuf(sqfp));
	  else if (status != eslOK)       esl_fatal("Unexpected error reading sequence file %s\n", sqfp->filename);
	  
	  if ((strcmp(sq->name, source) != 0) && (strcmp(sq->acc, source) != 0))
	    esl_fatal("Sequences in <sqfile> and <maskfile> aren't in same order; try -R");
	}
      
      /* If we're masking by lowercase, first make sure everything's uppercase */
      if (do_lowercase)
	for (pos = 0; pos < sq->n; pos++)
	  if (isalpha(sq->seq[pos]))
	    sq->seq[pos] = toupper(sq->seq[pos]);

      /* Next two fields are <start>, <end> for the masking  */
      /* possible future extension: wrap loop around this, enable multiple masked regions */
      if (esl_fileparser_GetTokenOnLine(maskefp, &p1, NULL) != eslOK)
	esl_fatal("Failed to read start coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      start = strtoll(p1, &p2, 0) - 1;

      if (esl_fileparser_GetTokenOnLine(maskefp, &p2, NULL) != eslOK) 
	esl_fatal("Failed to read end coord on line %d of file %s\n", maskefp->linenumber, maskfile);
      end   = strtoll(p2, &p1, 0) - 1;

      /* Do the masking */
      if (esl_opt_GetBoolean(go, "-r")) /* Reverse masking */
	{ /* leave start..end unmasked; mask prefix 0..start-1, end+1..L-1 */
	  i = 0;
	  j = ESL_MIN(sq->n-1, start - 1 + overmask);
	  for (pos = i; pos <= j; pos++)
	    if (isalpha(sq->seq[pos])) 
	      sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar);
	  
	  i = ESL_MAX(0, end + 1 - overmask);
	  j = sq->n-1;
	  for (pos = i; pos <= j; pos++)
	    if (isalpha(sq->seq[pos])) 
	      sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar);
	}
      else
	{  /* normal: mask start..end */
	  i = ESL_MAX(0,       start - overmask);
	  j = ESL_MIN(sq->n-1, end   + overmask);
	  for (pos = i; pos <= j; pos++)
	    if (isalpha(sq->seq[pos])) 
	      sq->seq[pos] = (do_lowercase ? tolower(sq->seq[pos]) : maskchar);
	}

      esl_sqio_Write(ofp, sq, outfmt, FALSE);
      esl_sq_Reuse(sq);
    }

  esl_sq_Destroy(sq);
  esl_fileparser_Close(maskefp);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  if (ofp != stdout) fclose(ofp);
  return 0;
}
Beispiel #11
0
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the 
 *            <P7_BG> object <bg>.
 *            
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *            
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 * 
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;
  int             n     = 0;
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);
  
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if      (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)         ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K) 
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK) 
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);
  
  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
Beispiel #12
0
/* multifetch:
 * given a file containing lines with one name or key per line;
 * parse the file line-by-line;
 * if we have an SSI index available, retrieve the seqs by key
 * as we see each line;
 * else, without an SSI index, store the keys in a hash, then
 * read the entire seq file in a single pass, outputting seqs
 * that are in our keylist. 
 * 
 * Note that with an SSI index, you get the seqs in the order they
 * appear in the <keyfile>, but without an SSI index, you get seqs in
 * the order they occur in the seq file.
 */
static void
multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
{
  ESL_KEYHASH    *keys   = esl_keyhash_Create();
  ESL_FILEPARSER *efp    = NULL;
  int             nseq   = 0;
  int             nkeys  = 0;
  char           *key;
  int             keylen;
  int             keyidx;
  int             status;

  
  if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)  esl_fatal("Failed to open key file %s\n", keyfile);
  esl_fileparser_SetCommentChar(efp, '#');

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
	esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);
      
      status = esl_keyhash_Store(keys, key, keylen, &keyidx);
      if (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
	
      /* if we have an SSI index, just fetch them as we go. */
      if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp);  nseq++; }
      nkeys++;
    }

  /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
  if (sqfp->data.ascii.ssi == NULL) 
    {
      ESL_SQ *sq     = esl_sq_Create();

      while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
	{
	  if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
	       (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
	    {
	      if (esl_opt_GetBoolean(go, "-r") )
		if (esl_sq_ReverseComplement(sq) != eslOK) 
		  esl_fatal("Failed to reverse complement %s\n", sq->name);
	      esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
	      nseq++;
	    }
	  esl_sq_Reuse(sq);
	}
      if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
					       sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					       status, sqfp->filename);
      esl_sq_Destroy(sq);
    }
  
  if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);

  if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);

  esl_keyhash_Destroy(keys);
  esl_fileparser_Close(efp);
  return;
}