Exemple #1
0
/* Function:  p7_coords2_hash_Reuse()
 * Synopsis:  Reuse a <P7_COORDS2_HASH>
 *
 * Purpose:   Clear a <P7_COORDS2_HASH> hash table for reuse.
 *
 *            If any allocations are overly large, drop them
 *            back to 'redline' values. Default redlines
 *            are 1024 keys (i.e. different coord pair arrays),
 *            1024 hash values, and 16384 total integers of
 *            raw data. Redlines are all 8x the default
 *            initial allocations.
 *
 *            After any shrinking, every hash table slot is reset
 *            to -1 (empty) and the key and data counters are zeroed.
 *
 * Args:      ch :  hash table to reuse
 *
 * Returns:   <eslOK> on success
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            (But any reallocations here are shrinkages, so I don't
 *            believe they can fail.)
 */
int
p7_coords2_hash_Reuse(P7_COORDS2_HASH *ch)
{
  int hashsize_redline = 1024;
  int kalloc_redline   = 1024;
  int calloc_redline   = 16384;
  int i;
  int status;

  if (ch->hashsize > hashsize_redline)
    {
      ESL_REALLOC(ch->hashtable, sizeof(int32_t) * hashsize_redline);
      ch->hashsize = hashsize_redline;
    }
  if (ch->kalloc > kalloc_redline)
    { 
      ESL_REALLOC(ch->nxt,        sizeof(int32_t) * kalloc_redline);
      ESL_REALLOC(ch->key_offset, sizeof(int32_t) * kalloc_redline);
      ch->kalloc = kalloc_redline;
    }
  if (ch->calloc > calloc_redline)
    {
      /* Shrink to the redline, not to the current (oversized) allocation;
       * otherwise the memory is never released while <calloc> claims it was.
       */
      ESL_REALLOC(ch->cmem, sizeof(int32_t) * calloc_redline);
      ch->calloc = calloc_redline;
    }

  for (i = 0; i < ch->hashsize; i++) ch->hashtable[i] = -1;
  ch->nkeys = 0;
  ch->cn    = 0;
  return eslOK;

 ERROR:
  return status;
}
Exemple #2
0
/* Function: esl_keyhash_Store()
 * Synopsis: Store a key and get a key index for it.
 *
 * Purpose:  Store the string <key> of length <n> in the key hash <kh>
 *           and assign it the next unused key index (indices count
 *           from 0, in order of insertion). The index lets callers
 *           map hashed keys onto ordinary integer-indexed C arrays.
 *           If <opt_index> is non-<NULL>, the index is returned
 *           through it.
 *
 *           <key>, <n> follow the standard idiom for strings and
 *           unterminated buffers: <n> of -1 means <key> is
 *           NUL-terminated and its length is taken with strlen().
 *           Note that the hash value is computed from <n> exactly as
 *           the caller passed it, before -1 is resolved.
 *
 * Returns:  <eslOK> on success; <key> is stored and <*opt_index> is
 *           its new index.
 *           <eslEDUP> if <key> was already present; <*opt_index> is
 *           the index it already had.
 *
 * Throws:   <eslEMEM> on allocation failure, and sets <*opt_index> to -1.
 */
int
esl_keyhash_Store(ESL_KEYHASH *kh, const char *key, esl_pos_t n, int *opt_index)
{
  uint32_t bucket = jenkins_hash(key, n, kh->hashsize);
  int      k;
  int      status;

  if (n == -1) n = strlen(key);

  /* Walk this bucket's chain; an existing match means a duplicate. */
  k = kh->hashtable[bucket];
  while (k != -1)
    {
      if (esl_memstrcmp(key, n, kh->smem + kh->key_offset[k]))
        {
          if (opt_index != NULL) *opt_index = k;
          return eslEDUP;
        }
      k = kh->nxt[k];
    }

  /* Per-key arrays are full: double them. */
  if (kh->nkeys == kh->kalloc)
    {
      ESL_REALLOC(kh->key_offset, sizeof(int)*kh->kalloc*2);
      ESL_REALLOC(kh->nxt,        sizeof(int)*kh->kalloc*2);
      kh->kalloc *= 2;
    }

  /* Keep doubling string storage until key + terminator fits. */
  while (kh->sn + n + 1 > kh->salloc)
    {
      ESL_REALLOC(kh->smem, sizeof(char) * kh->salloc * 2);
      kh->salloc *= 2;
    }

  /* Append the key text at the current end of string memory. */
  k                 = kh->nkeys;
  kh->key_offset[k] = kh->sn;
  kh->sn           += n+1;
  esl_memstrcpy(key, n, kh->smem + kh->key_offset[k]);
  kh->nkeys++;

  /* Push the new key onto the front of its bucket's chain. */
  kh->nxt[k]            = kh->hashtable[bucket];
  kh->hashtable[bucket] = k;

  /* Once the table is more than 3x saturated, grow it. */
  if (kh->nkeys > 3*kh->hashsize)
    {
      status = key_upsize(kh);
      if (status != eslOK) goto ERROR;
    }

  if (opt_index != NULL) *opt_index = k;
  return eslOK;

 ERROR:
  if (opt_index != NULL) *opt_index = -1;
  return status;
}
Exemple #3
0
/* p7_coords2_hash_upsize()
 *
 * Grow the hash table in <ch> by a factor of 8 because it has become
 * too full. A larger table changes every key's hash value, so all
 * stored keys are rehashed and relinked; the key data itself stays
 * where it is in <ch->cmem>.
 *
 * If the table already has 2^28 or more slots it is left alone and
 * <eslOK> is returned, since another 8x step would not fit in 31 bits.
 *
 * Returns: <eslOK> on success (including the "too big to grow" case).
 *
 * Throws:  <eslEMEM> on allocation failure.
 */
int
p7_coords2_hash_upsize(P7_COORDS2_HASH *ch)
{
  uint32_t bucket;
  int32_t  slot;
  int32_t  k;
  int      status;

  /* Growth is in steps of 2^3, so the size must stay below 2^(31-3). */
  if (ch->hashsize >= (1<<28)) return eslOK;

  ESL_REALLOC(ch->hashtable, sizeof(int32_t) * (ch->hashsize << 3));
  ch->hashsize <<= 3;

  /* Start from an empty table... */
  for (slot = 0; slot < ch->hashsize; slot++)
    ch->hashtable[slot] = -1;

  /* ...then push every stored key onto the head of its new bucket chain. */
  for (k = 0; k < ch->nkeys; k++)
    {
      bucket                = p7_coords2_hash_function_alt(ch->cmem + ch->key_offset[k], ch->hashsize);
      ch->nxt[k]            = ch->hashtable[bucket];
      ch->hashtable[bucket] = k;
    }
  return eslOK;

 ERROR:
  return eslEMEM;
}
Exemple #4
0
/* Function:  p7_filtermx_GrowTo_avx()
 * Synopsis:  Resize filter DP matrix for new profile size (AVX2 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx> and the
 *            length <allocM> of a new profile (in consensus
 *            positions), make sure the AVX storage in <fx> is large
 *            enough for it, reallocating only when it is not.
 *
 *            The raw allocation is padded by <p7_VALIGN_AVX-1> bytes
 *            and <fx->dp_AVX> is set to an aligned address inside it.
 *
 * Returns:   <eslOK> on success.
 *            <eslENORESULT> if compiled without <HAVE_AVX2>.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_avx(P7_FILTERMX *fx, int allocM)
{
  int status;

  /* Contract check runs in every build, with or without AVX2. */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

#ifndef HAVE_AVX2
  return eslENORESULT;
#else
  /* Nothing to do when the current allocation already covers allocM. */
  if (fx->allocM_AVX >= allocM) return eslOK;

  ESL_REALLOC(fx->dp_mem_AVX, (sizeof(__m256i) * (p7F_NSCELLS * P7_NVW_AVX(allocM))) + (p7_VALIGN_AVX-1));
  fx->allocM_AVX = allocM;

  /* Bump the raw pointer by (alignment-1) and mask it to get the aligned start.
   * NOTE(review): the round trip through unsigned long assumes a pointer fits in it.
   */
  fx->dp_AVX     = (__m256i *) ( (unsigned long int) ( (char *) fx->dp_mem_AVX + (p7_VALIGN_AVX-1)) & p7_VALIMASK_AVX);
  return eslOK;

 ERROR:
  return status;
#endif /* HAVE_AVX2 */
}
Exemple #5
0
/* Function:  p7_anchors_Resize()
 * Synopsis:  Reallocate a P7_ANCHORS object, if necessary
 *
 * Purpose:   Make sure that <anch> can hold an array of
 *            at least <D> anchors. The array is sized for <D+2>
 *            elements.
 *
 *            Existing contents of <anch> are not touched, so it is
 *            safe to resize an anchor array that is being grown
 *            incrementally (as in the segmental divide and conquer
 *            MPAS algorithm).
 *
 *            D=0 is a valid argument and may occur in normal use; it
 *            results in a no-op, because the structure is always big
 *            enough to hold zero anchors.
 *
 *            Growth policy: while below the redline, or when anchors
 *            are already stored (<anch->D > 0>, suggesting incremental
 *            building), the allocation is doubled until it fits, to
 *            limit future reallocations. Otherwise exactly <D+2>
 *            elements are allocated; a later _Reuse() pulls the
 *            allocation back to the redline.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *
 * Xref:      First example of a new pattern for how we
 *            can handle reallocation/reuse strategy,
 *            replacing _Reinit() and _Grow() interfaces.
 *            [SRE:J14/1]
 */
int
p7_anchors_Resize(P7_ANCHORS *anch, int D)
{
    int newsize;
    int status;

    /* Contract checks, argument validation */
    ESL_DASSERT1(( anch->nalloc > 0 ));

    /* Already big enough: nothing to do. */
    if (D+2 <= anch->nalloc) return eslOK;

    if (D+2 < anch->nredline || anch->D > 0)
    {
        /* Under the redline, or growing incrementally: double until it fits. */
        newsize = anch->nalloc;
        while (newsize < D+2)
            newsize *= 2;
    }
    else
    {
        /* Over the redline on what looks like an empty object: allocate exactly. */
        newsize = D+2;
    }

    ESL_REALLOC(anch->a, sizeof(P7_ANCHOR) * newsize);
    anch->nalloc = newsize;
    return eslOK;

ERROR:
    return status;
}
Exemple #6
0
/* Function:  p7_hmm_mpi_Recv()
 * Synopsis:  Receives an HMM as a work unit from an MPI sender.
 *
 * Purpose:   Receive a work unit that consists of a single HMM
 *            sent by MPI <source> (<0..nproc-1>, or
 *            <MPI_ANY_SOURCE>) tagged as <tag> for MPI communicator <comm>.
 *            
 *            Work units are prefixed by a status code that gives the
 *            number of HMMs to follow; here, 0 or 1 (but in the future,
 *            we could easily extend to sending several HMMs in one 
 *            packed buffer). If we receive a 1 code and we successfully
 *            unpack an HMM, this routine will return <eslOK> and a non-<NULL> <*ret_hmm>.
 *            If we receive a 0 code (a shutdown signal), 
 *            this routine returns <eslEOD> and <*ret_hmm> is <NULL>.
 *   
 *            Caller provides a working buffer <*buf> of size
 *            <*nalloc> characters. These are passed by reference, so
 *            that <*buf> can be reallocated and <*nalloc> increased
 *            if necessary. As a special case, if <*buf> is <NULL> and
 *            <*nalloc> is 0, the buffer will be allocated
 *            appropriately, but the caller is still responsible for
 *            free'ing it.
 *            
 *            Caller may or may not already know what alphabet the HMM
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <byp_abc>. If the alphabet is unknown,
 *            pass <*byp_abc = NULL>, and when the HMM is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*byp_abc>.  If the alphabet is
 *            already known, <*byp_abc> is that alphabet, and the new
 *            HMM's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first HMM
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Args:      source  - index of MPI sender, 0..nproc-1 (0=master), or MPI_ANY_SOURCE
 *            tag     - MPI message tag;  MPI_ANY_TAG, or a specific message tag (0..32767 will work on any MPI)
 *            comm    - MPI communicator; MPI_COMM_WORLD, or a specific MPI communicator
 *            buf     - working buffer (for receiving packed message);
 *                      if <*buf> == NULL, a <*buf> is allocated and returned;
 *                      if <*buf> != NULL, it is used (and may be reallocated)
 *            nalloc  - allocation size of <*buf> in bytes; pass 0 if <*buf==NULL>.           
 *            byp_abc - BYPASS: <*byp_abc> == ESL_ALPHABET *> if known;
 *                              <*byp_abc> == NULL> if alphabet unknown.
 *            ret_hmm - RETURN: newly allocated/received profile; must not be <NULL>.
 *
 * Returns:   <eslOK> on success. <*ret_hmm> contains the received HMM;
 *            it is allocated here, and the caller is responsible for
 *            free'ing it.  <*buf> may have been reallocated to a
 *            larger size, and <*nalloc> may have been increased.  If
 *            <*byp_abc> was passed as <NULL>, it now points to an
 *            <ESL_ALPHABET> object that was allocated here; caller is
 *            responsible for free'ing this.
 *            
 *            Returns <eslEOD> if an end-of-data signal was received.
 *            In this case <*byp_abc> is left unchanged and <*ret_hmm>
 *            is <NULL>. <*buf> and <*nalloc> may have been changed,
 *            because the buffer is sized before the message is decoded.
 *            
 *            Returns <eslEINCOMPAT> if the HMM is in a different alphabet
 *            than <*byp_abc> said to expect. In this case, <*byp_abc> is unchanged,
 *            <*buf> and <*nalloc> may have been changed, and <*ret_hmm> is
 *            <NULL>.
 *            
 * Throws:    <eslEMEM> on allocation error, and <eslESYS> on MPI communication
 *            errors; in either case <*ret_hmm> is <NULL>.           
 */
int
p7_hmm_mpi_Recv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **byp_abc, P7_HMM **ret_hmm)
{
  int         pos = 0;
  int         code;
  int         n;
  MPI_Status  mpistatus;
  int         status;

  /* ESL_EXCEPTION returns straight out of the function, so establish the
   * documented "<*ret_hmm> is NULL on any failure" guarantee up front.
   */
  *ret_hmm = NULL;

  /* Probe first, because we need to know if our buffer is big enough. */
  if ( MPI_Probe(source, tag, comm, &mpistatus)  != MPI_SUCCESS) ESL_EXCEPTION(eslESYS, "mpi probe failed");
  if ( MPI_Get_count(&mpistatus, MPI_PACKED, &n) != MPI_SUCCESS) ESL_EXCEPTION(eslESYS, "mpi get count failed");

  /* Make sure the buffer is allocated appropriately */
  if (*buf == NULL || n > *nalloc) 
    {
      ESL_REALLOC(*buf, sizeof(char) * n);
      *nalloc = n; 
    }

  /* Receive the entire packed work unit */
  if (MPI_Recv(*buf, n, MPI_PACKED, source, tag, comm, &mpistatus) != MPI_SUCCESS) ESL_EXCEPTION(eslESYS, "mpi recv failed");

  /* Unpack the status code prefix */
  if (MPI_Unpack(*buf, n, &pos, &code, 1, MPI_INT, comm) != MPI_SUCCESS) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* code 0 = shutdown signal, code 1 = one HMM follows; anything else is corrupt */
  if      (code == 0) { status = eslEOD; *ret_hmm = NULL; }
  else if (code == 1)   status = p7_hmm_mpi_Unpack(*buf, *nalloc, &pos, comm, byp_abc, ret_hmm);
  else                  ESL_EXCEPTION(eslESYS, "bad mpi buffer transmission code");
  return status;

 ERROR: /* from ESL_REALLOC only */
  *ret_hmm = NULL;
  return status;
}
/* Function:  p7_filtermx_GrowTo_neon64()
 * Synopsis:  Resize filter DP matrix for new profile size (NEON64 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx> and the
 *            length <allocM> of a new profile (in consensus
 *            positions), make sure the storage in <fx> is large
 *            enough for it, reallocating only when it is not.
 *
 *            The raw allocation is padded by <p7_VALIGN-1> bytes and
 *            <fx->dp> is set to an aligned address inside it.
 *
 * Returns:   <eslOK> on success.
 *            <eslENORESULT> if compiled without <HAVE_NEON64>; in
 *            that build the function does nothing else.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_neon64(P7_FILTERMX *fx, int allocM)
{
#ifndef HAVE_NEON64
  return eslENORESULT;
#else
  int status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

  /* Nothing to do when the current allocation already covers allocM. */
  if (fx->allocM >= allocM) return eslOK;

  ESL_REALLOC(fx->dp_mem, (sizeof(esl_neon_128i_t) * (p7F_NSCELLS * P7_NVW(allocM))) + (p7_VALIGN-1));
  fx->allocM = allocM;

  /* Bump the raw pointer by (alignment-1) and mask it to get the aligned start.
   * NOTE(review): the round trip through unsigned long assumes a pointer fits in it.
   */
  fx->dp     = (esl_neon_128i_t *) ( (unsigned long int) ( (char *) fx->dp_mem + (p7_VALIGN-1)) & p7_VALIMASK);
  return eslOK;

 ERROR:
  return status;
#endif /* HAVE_NEON64 */
}
Exemple #8
0
/* Function:  p7_hmm_mpi_Send()
 * Synopsis:  Send an HMM as an MPI work unit.
 *
 * Purpose:   Send the HMM <hmm> as a work unit to MPI process <dest>
 *            (0..<nproc-1>), tagged with MPI tag <tag>, on
 *            communicator <comm>, as the sole work unit or result.
 *
 *            The message starts with an integer status code giving
 *            the number of HMMs that follow. When <hmm> is <NULL> the
 *            code is 0 and nothing else is packed; <_Recv()> treats
 *            such a unit as an EOD (end-of-data) signal, used to shut
 *            worker processes down cleanly.
 *
 *            To avoid repeated alloc/free cycles, the caller passes a
 *            working buffer <*buf> of <*nalloc> characters by
 *            reference. If the message does not fit, <*buf> is
 *            reallocated and <*nalloc> is raised to the new size. As
 *            a special case, <*buf> may be <NULL> with <*nalloc> 0;
 *            the buffer is then allocated here, and the caller is
 *            still responsible for free'ing it.
 *
 * Returns:   <eslOK> on success; <*buf> may have been reallocated and
 *            <*nalloc> may have been increased.
 *
 * Throws:    <eslESYS> if an MPI call fails; <eslEMEM> if a malloc/realloc
 *            fails. In either case, <*buf> and <*nalloc> remain valid and useful
 *            memory (though the contents of <*buf> are undefined).
 *            A failure status from <p7_hmm_mpi_PackSize()> or
 *            <p7_hmm_mpi_Pack()> is passed through unchanged.
 *
 * Note:      Compare to p7_hmmfile_WriteBinary(). The two operations (sending
 *            an HMM via MPI, or saving it as a binary file to disk) are
 *            similar.
 */
int
p7_hmm_mpi_Send(const P7_HMM *hmm, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc)
{
  int   nbytes = 0;     /* total packed message size */
  int   code;
  int   sz;
  int   pos;
  int   status;

  /* Message size = status code + HMM. The code is always sent (0=EOD=nothing follows). */
  if (MPI_Pack_size(1, MPI_INT, comm, &sz) != MPI_SUCCESS)
    ESL_EXCEPTION(eslESYS, "mpi pack size failed");
  nbytes += sz;

  status = p7_hmm_mpi_PackSize(hmm, comm, &sz);
  if (status != eslOK) return status;
  nbytes += sz;

  /* Grow (or create) the caller's buffer if the message does not fit. */
  if (*buf == NULL || nbytes > *nalloc)
    {
      ESL_REALLOC(*buf, sizeof(char) * nbytes);
      *nalloc = nbytes;
    }

  /* Pack the status code (# of HMMs in this message: 1 or 0), then the HMM if any. */
  pos  = 0;
  code = (hmm != NULL);
  if (MPI_Pack(&code, 1, MPI_INT, *buf, nbytes, &pos, comm) != MPI_SUCCESS)
    ESL_EXCEPTION(eslESYS, "mpi pack failed");
  if (hmm != NULL)
    {
      status = p7_hmm_mpi_Pack(hmm, *buf, nbytes, &pos, comm);
      if (status != eslOK) return status;
    }

  /* Ship the packed buffer to the destination. */
  if (MPI_Send(*buf, nbytes, MPI_PACKED, dest, tag, comm) != MPI_SUCCESS)
    ESL_EXCEPTION(eslESYS, "mpi send failed");
  return eslOK;

 ERROR:
  return status;
}
Exemple #9
0
/* Function:  p7_masstrace_GrowTo()
 * Synopsis:  Grow a mass trace object for a new model/sequence size.
 *
 * Purpose:   Make sure <mt->kmass> has room for at least <M+2> floats
 *            and, if <mt->imass> is in use (non-<NULL>), that it has
 *            room for at least <L+2> floats. Arrays that are already
 *            large enough are left alone; <mt->imass> is never
 *            created here if it was <NULL>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_masstrace_GrowTo(P7_MASSTRACE *mt, int M, int L)
{
  int status;

  /* Sequence-position array: only maintained when it exists. */
  if (mt->imass != NULL && mt->ialloc < L+2)
    {
      ESL_REALLOC(mt->imass, sizeof(float) * (L+2));
      mt->ialloc = L+2;
    }

  /* Model-position array. */
  if (mt->kalloc < M+2)
    {
      ESL_REALLOC(mt->kmass, sizeof(float) * (M+2));
      mt->kalloc = M+2;
    }
  return eslOK;

 ERROR:
  return status;
}
/* Function:  p7_sparsemask_Reinit_avx512()
 * Synopsis:  Reinitialize an existing P7_SPARSEMASK for a new comparison.
 *
 * Purpose:   Prepare the AVX-512 fields of an existing <P7_SPARSEMASK>
 *            for a model of length <M> and a sequence of length <L>,
 *            reusing memory where possible. The per-row arrays
 *            <k_AVX_512> and <n_AVX_512> are grown to <L+1> entries
 *            only if they are smaller; segment and cell memory keep
 *            their previous allocations.
 *
 *            The reallocation counters in <sm> are not reset; they
 *            accumulate over the life of the object.
 *
 * Returns:   <eslOK> on success.
 *            <eslENORESULT> if compiled without <HAVE_AVX512>.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_sparsemask_Reinit_avx512(P7_SPARSEMASK *sm, int M, int L)
{
#ifndef HAVE_AVX512
  return eslENORESULT;
#else
  int row;
  int lane;
  int status;

  sm->L          = L;
  sm->M          = M;
  sm->Q_AVX_512  = P7_NVF_AVX_512(M);

  /* Row-indexed arrays need L+1 slots; grow them if they fall short. */
  if (sm->ralloc_AVX_512 < L+1)
    {
      ESL_REALLOC(sm->k_AVX_512, sizeof(int *) * (L+1));
      ESL_REALLOC(sm->n_AVX_512, sizeof(int)   * (L+1));
      sm->ralloc_AVX_512 = L+1;
      sm->n_rrealloc++;
    }

  /* Reset bookkeeping for an empty mask. sn[] is set per row by _StartRow(). */
  sm->S_AVX_512       = 0;
  sm->nrow_AVX_512    = 0;
  sm->ncells_AVX_512  = 0;
  sm->last_i_AVX_512  = sm->L+1;
  for (lane = 0; lane < p7_VNF_AVX_512; lane++)
    sm->last_k_AVX_512[lane] = -1;

  /* n[0] will always be 0, so only n[1..L] are cleared. */
  for (row = 1; row <= L; row++)
    sm->n_AVX_512[row] = 0;

  return eslOK;

 ERROR:
  return status;
#endif /* HAVE_AVX512 */
}
Exemple #11
0
/* Function:  p7_gmxb_Reinit()
 * Synopsis:  Reuse a banded matrix <gxb> with a new set of bands <bnd>.
 *
 * Purpose:   Grow the main cell storage <gxb->dp> to hold <bnd->ncell>
 *            cells and the special-state storage <gxb->xmx> to hold
 *            <bnd->nrow> rows, when the current allocations are
 *            smaller, then attach <bnd> to <gxb>. Allocations are
 *            never shrunk.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <gxb->bnd> is left as it was.
 */
int
p7_gmxb_Reinit(P7_GMXB *gxb, P7_GBANDS *bnd)
{
    int status;

    /* Main DP cells */
    if (gxb->dalloc < bnd->ncell)
    {
        ESL_REALLOC(gxb->dp,  sizeof(float) * bnd->ncell * p7G_NSCELLS);
        gxb->dalloc = bnd->ncell;
    }

    /* Special-state cells, one set per row */
    if (gxb->xalloc < bnd->nrow)
    {
        ESL_REALLOC(gxb->xmx, sizeof(float) * bnd->nrow  * p7G_NXCELLS);
        gxb->xalloc = bnd->nrow;
    }

    gxb->bnd = bnd;
    return eslOK;

ERROR:
    return status;
}
Exemple #12
0
/* Function:  p7_gbands_GrowRows()
 * Synopsis:  Double the row allocation of a band structure.
 *
 * Purpose:   Reallocate <bnd->kmem> for twice the current
 *            <bnd->rowalloc> rows (<p7_GBANDS_NK> ints per row) and
 *            record the new row allocation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <bnd->rowalloc> is unchanged.
 */
int
p7_gbands_GrowRows(P7_GBANDS *bnd)
{
  int doubled = bnd->rowalloc * 2;   /* grow by doubling */
  int status;

  ESL_REALLOC(bnd->kmem, sizeof(int) * doubled * p7_GBANDS_NK);
  bnd->rowalloc = doubled;
  return eslOK;

 ERROR:
  return status;
}
Exemple #13
0
/* Function:  p7_gbands_GrowSegs()
 * Synopsis:  Double the segment allocation of a band structure.
 *
 * Purpose:   Reallocate <bnd->imem> for twice the current
 *            <bnd->segalloc> segments (two ints per segment) and
 *            record the new segment allocation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <bnd->segalloc> is unchanged.
 */
int
p7_gbands_GrowSegs(P7_GBANDS *bnd)
{
  int doubled = bnd->segalloc * 2;
  int status;

  ESL_REALLOC(bnd->imem, sizeof(int) * doubled * 2);
  bnd->segalloc = doubled;
  return eslOK;

 ERROR:
  return status;
}
Exemple #14
0
/* Function:  p7_coords2_GrowTo()
 * Synopsis:  Make sure a coord pair array can hold a given number of pairs.
 *
 * Purpose:   If <c2> is allocated for fewer than <nalloc> coord pairs,
 *            reallocate <c2->arr> to exactly <nalloc> pairs. A larger
 *            existing allocation is left alone.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_coords2_GrowTo(P7_COORDS2 *c2, int32_t nalloc)
{
  int status;

  if (c2->nalloc < nalloc)
    {
      ESL_REALLOC(c2->arr, sizeof(P7_COORD2) * nalloc);
      c2->nalloc = nalloc;
    }
  return eslOK;

 ERROR:
  return status;
}
Exemple #15
0
/* Function:  p7_coords2_Grow()
 * Synopsis:  Increase allocation for coord pairs, if needed.
 *
 * Purpose:   Make sure <c2> has room for one more coord pair. When
 *            the array is full (<c2->n> has reached <c2->nalloc>),
 *            its allocation is doubled; otherwise nothing happens.
 *
 * Args:      c2  : coord pair array
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_coords2_Grow(P7_COORDS2 *c2)
{
  int status;

  if (c2->n >= c2->nalloc)
    {
      ESL_REALLOC(c2->arr, sizeof(P7_COORD2) * c2->nalloc * 2);
      c2->nalloc = c2->nalloc * 2;
    }
  return eslOK;

 ERROR:
  return status;
}
Exemple #16
0
/* Function:  p7_anchorhash_Reuse()
 * Synopsis:  Reuse a <P7_ANCHORHASH>
 *
 * Purpose:   Clear a <P7_ANCHORHASH> hash table for reuse.
 *
 *            If any allocations are overly large, drop them
 *            back to 'redline' values. Default redlines
 *            are 1024 keys (i.e. different anchor sets)
 *            1024 hash values, and 16384 total integers of
 *            raw data. Redlines are all 8x the default
 *            initial allocations.
 *
 *            After any shrinking, every hash table slot is reset
 *            to -1 (empty), and <L>, <M>, the key count and the
 *            data count are zeroed.
 *
 * Args:      ah :  hash table to reuse
 *
 * Returns:   <eslOK> on success
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            (But any reallocations here are shrinkages, so I don't
 *            believe they can fail.)
 */
int
p7_anchorhash_Reuse(P7_ANCHORHASH *ah)
{
  int hashsize_redline = 1024;
  int kalloc_redline   = 1024;
  int aalloc_redline   = 16384;
  int i;
  int status;

  if (ah->hashsize > hashsize_redline)
    {
      ESL_REALLOC(ah->hashtable, sizeof(int32_t) * hashsize_redline);
      ah->hashsize = hashsize_redline;
    }
  if (ah->kalloc > kalloc_redline)
    { 
      ESL_REALLOC(ah->nxt,        sizeof(int32_t) * kalloc_redline);
      ESL_REALLOC(ah->key_offset, sizeof(int32_t) * kalloc_redline);
      ESL_REALLOC(ah->key_count,  sizeof(int32_t) * kalloc_redline);
      ah->kalloc = kalloc_redline;
    }
  if (ah->aalloc > aalloc_redline)
    {
      /* Shrink to the redline, not to the current (oversized) allocation;
       * otherwise the memory is never released while <aalloc> claims it was.
       */
      ESL_REALLOC(ah->amem, sizeof(int32_t) * aalloc_redline);
      ah->aalloc = aalloc_redline;
    }

  for (i = 0; i < ah->hashsize; i++) ah->hashtable[i] = -1;
  ah->L     = 0;
  ah->M     = 0;
  ah->nkeys = 0;
  ah->an    = 0;
  return eslOK;

 ERROR:
  return status;
}
Exemple #17
0
/* Function:  p7_coords2_Reuse()
 * Synopsis:  Reset a coord pair array for reuse.
 *
 * Purpose:   Empty <c2> by setting its pair count to zero. If its
 *            allocation has grown past <c2->nredline>, shrink
 *            <c2->arr> back to the redline first.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <c2->n> is not reset in that case.
 */
int
p7_coords2_Reuse(P7_COORDS2 *c2)
{
  int status;

  if (c2->nredline < c2->nalloc)
    {
      ESL_REALLOC(c2->arr, sizeof(P7_COORD2) * c2->nredline);
      c2->nalloc = c2->nredline;
    }

  c2->n = 0;
  return eslOK;

 ERROR:
  return status;
}
Exemple #18
0
/* Function:  p7_anchors_Reuse()
 * Synopsis:  Reset an anchor array for reuse.
 *
 * Purpose:   Empty <anch> by setting its anchor count <D> to zero. If
 *            its allocation has grown past <anch->nredline>, shrink
 *            <anch->a> back to the redline first.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <anch->D> is not reset in that case.
 */
int
p7_anchors_Reuse(P7_ANCHORS *anch)
{
    int status;

    if (anch->nredline < anch->nalloc)
    {
        ESL_REALLOC(anch->a, sizeof(P7_ANCHOR) * anch->nredline);
        anch->nalloc = anch->nredline;
    }

    anch->D = 0;
    return eslOK;

ERROR:
    return status;
}
/* Function:  p7_sparsemask_StartRow_avx512()
 * Synopsis:  Begin adding sparse cells for sequence row <i> (AVX-512 build).
 *
 * Purpose:   Prepare <sm> to receive cells for row <i>. Cell memory
 *            <kmem_AVX_512> is doubled if the current allocation
 *            cannot hold one more full row of <p7_VNF_AVX_512 * Q>
 *            cells. The per-lane slot pointers are then aimed at
 *            consecutive <Q>-sized stretches after the cells already
 *            stored, the per-lane counts are zeroed, and the
 *            "last row" / "last k" trackers are reset.
 *
 *            In <p7_DEBUGGING> builds, <i> must be in 1..L and
 *            strictly smaller than the previously started row.
 *
 * Returns:   <eslOK> on success.
 *            <eslENORESULT> if compiled without <HAVE_AVX512>.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> on a failed debug-build contract check.
 */
int
p7_sparsemask_StartRow_avx512(P7_SPARSEMASK *sm, int i)
{
 #ifdef HAVE_AVX512 
  int r;
  int status;
  
#ifdef p7_DEBUGGING
  if (i < 1 || i > sm->L) ESL_EXCEPTION(eslEINVAL, "i is 1..L: sequence position");
  /* Compare against the AVX-512 row tracker, which is the one this code path maintains. */
  if (sm->last_i_AVX_512 <= i) ESL_EXCEPTION(eslEINVAL, "rows need to be added in reverse order L..1");
#endif

  /* Make sure kmem has enough memory; if not, double it.
   * Because we know the original allocation was enough to hold
   * the slots, we know that doubling (even if ncells has filled
   * the current kalloc) is sufficient.
   */
  if (sm->ncells_AVX_512 + p7_VNF_AVX_512*sm->Q_AVX_512 > sm->kalloc_AVX_512)
    {
      int64_t kalloc_req = sm->kalloc_AVX_512 * 2;
      ESL_REALLOC(sm->kmem_AVX_512, sizeof(int) * kalloc_req);
      sm->kalloc_AVX_512 = kalloc_req;
      sm->n_krealloc++;
    }
  
  /* Slot pointers are assigned in reverse lane order; each slot is Q cells wide. */
  for (r = 0; r < p7_VNF_AVX_512; r++)
    {
      sm->s_AVX_512[p7_VNF_AVX_512-r-1] = sm->kmem_AVX_512 + sm->ncells_AVX_512 + r*sm->Q_AVX_512;
      sm->sn_AVX_512[r]         = 0;
    }
  sm->last_i_AVX_512 = i;
  for (r = 0; r < p7_VNF_AVX_512; r++) 
    sm->last_k_AVX_512[r] = sm->M+1;    /* sentinel to be sure that Add() is called in reverse order M..1 */
  return eslOK;
  
 ERROR:
  return status;
  #endif //HAVE_AVX512
  #ifndef HAVE_AVX512
  return eslENORESULT;
  #endif
}
Exemple #20
0
/* Function:  p7_hit_Grow()
 * Synopsis:  Change the allocation of a P7_HIT array.
 *
 * Purpose:   Reallocate the <P7_HIT> array <*hitp> from <oldalloc>
 *            to <newalloc> elements. Each element in the newly added
 *            range <oldalloc..newalloc-1> has its <name>, <acc>,
 *            <desc> and <dcl> pointers set to <NULL> and its <ndom>
 *            set to 0. If <newalloc> is not larger than <oldalloc>,
 *            no elements are initialized.
 *
 * Returns:   <eslOK> on success; <*hitp> may have moved.
 *
 * Throws:    <eslEMEM> on reallocation failure. <*hitp> is
 *            unchanged, as is the array's contents.
 */
int
p7_hit_Grow(P7_HIT **hitp, int oldalloc, int newalloc)
{
  P7_HIT *hit;
  int     idx;
  int     status;

  ESL_REALLOC( (*hitp), sizeof(P7_HIT) * newalloc);

  /* Put the fresh tail of the array into a known empty state. */
  for (idx = oldalloc; idx < newalloc; idx++)
    {
      hit       = *hitp + idx;
      hit->name = NULL;
      hit->acc  = NULL;
      hit->desc = NULL;
      hit->dcl  = NULL;
      hit->ndom = 0;
    }
  return eslOK;

 ERROR:
  return status;
}
Exemple #21
0
/* Function:  p7_coords2_Sample()
 * Synopsis:  Sample random, sorted, nonoverlapping segment coords.
 *
 * Purpose:   Sample a number of segments <nseg> uniformly in
 *            1..<maxseg>, then sample <nseg> start/end coordinate
 *            pairs in 1..<L> that are sorted and do not overlap, and
 *            store them in <c2> (growing it if needed).
 *
 *            The sampler uses a workspace of <L> integers, handled
 *            with the bypass idiom via <byp_wrk>: pass <NULL> to have
 *            it allocated and freed internally; pass a pointer to a
 *            <NULL> pointer to have it allocated here and returned;
 *            or pass a pointer to an existing allocation, which is
 *            reallocated to <L> integers and handed back.
 *
 *            The first <2*nseg> workspace entries are read, so the
 *            caller must choose <maxseg> such that <2*maxseg <= L>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure, including failure to
 *            grow <c2>. A workspace allocated here is freed; a
 *            caller-provided workspace remains owned by the caller
 *            and <*byp_wrk> is kept pointing at its current location.
 */
int
p7_coords2_Sample(ESL_RANDOMNESS *rng, P7_COORDS2 *c2, int32_t maxseg, int32_t L, int32_t **byp_wrk)
{
  int32_t *wrk  = NULL;
  int32_t  nseg = 1 + esl_rnd_Roll(rng, maxseg); /* 1..maxseg */
  int32_t  i;
  int      status;

  /* Using the bypass idiom, make sure we have a workspace for <L> coords */
  if      (esl_byp_IsInternal(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsReturned(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsProvided(byp_wrk) ) { wrk = *byp_wrk; ESL_REALLOC(wrk, sizeof(int32_t) * L); }
			      
  /* We put the numbers 1..L into the workspace <wrk>; shuffle them;
   * then sort the top nseg*2 of them. This gives us <nseg>
   * nonoverlapping start/end coords, in order.
   */
  for (i = 0; i < L; i++) wrk[i] = i+1;
  esl_vec_IShuffle(rng, wrk, L);
  esl_vec_ISortIncreasing(wrk, nseg*2);

  /* Store those randomized coords now in the data structure.
   * The grow must succeed before we write into c2->arr.
   */
  if ((status = p7_coords2_GrowTo(c2, nseg)) != eslOK) goto ERROR;
  c2->n    = nseg;
  for (i = 0; i < nseg; i++)
    {
      c2->arr[i].n1 = wrk[i*2];
      c2->arr[i].n2 = wrk[i*2+1];
    }
  
  /* Using the bypass idiom, recycle workspace, if we're supposed to */
  if      (esl_byp_IsInternal(byp_wrk)) free(wrk);
  else if (esl_byp_IsReturned(byp_wrk)) *byp_wrk = wrk;
  else if (esl_byp_IsProvided(byp_wrk)) *byp_wrk = wrk;
  return eslOK;

 ERROR:
  if (esl_byp_IsProvided(byp_wrk))
    {
      /* The realloc may have moved the caller's buffer; don't leave <*byp_wrk> stale. */
      if (wrk) *byp_wrk = wrk;
    }
  else if (wrk)
    {
      /* Internal or to-be-returned workspace was allocated here: release it. */
      free(wrk);
    }
  return status;
}
/* Function:  allocateSeqdata()
 * Synopsis:  ensure that space is allocated for the seqdata object
 *            in the FM-index metadata.
 *
 * Purpose:   Make sure <meta->seq_data> has a slot for sequence
 *            number <numseqs>, then allocate that slot's <name>,
 *            <acc>, <source> and <desc> buffers, each sized to the
 *            length of the corresponding string in <sq> plus a
 *            terminator. The lengths are recorded in the slot; the
 *            strings themselves are not copied here.
 *
 *            When <numseqs> has reached <*allocedseqs>, the array is
 *            grown to 4x its capacity (or to 4 slots if the capacity
 *            was 0), and <*allocedseqs> is updated once the
 *            reallocation has succeeded.
 *
 * Args:      meta        - FM metadata holding the <seq_data> array
 *            sq          - sequence whose name/acc/source/desc lengths are used
 *            numseqs     - index of the slot to prepare
 *            allocedseqs - current capacity of <seq_data>; may be increased
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failure; <*allocedseqs> then still
 *            reflects the real capacity of <meta->seq_data>.
 */
int
allocateSeqdata (FM_METADATA *meta, ESL_SQ *sq, int numseqs, int *allocedseqs) {
  int length;
  int newalloc;
  int status = eslOK;


  if (numseqs == *allocedseqs) { // either first allocation, or increase in size
    // we've bumped up against the allocation limit: quadruple it (start at 4 if empty)
    newalloc = (*allocedseqs > 0) ? *allocedseqs * 4 : 4;
    ESL_REALLOC (meta->seq_data, newalloc * sizeof(FM_SEQDATA));
    if (meta->seq_data == NULL )
      esl_fatal("unable to allocate memory to store FM meta data\n");
    *allocedseqs = newalloc;   // only record the new capacity once we really have it
  }

  //allocate space for the name, source, acc, and desc of the sequence source for the block
  length = strlen(sq->name);
  meta->seq_data[numseqs].name_length = length;
  ESL_ALLOC (meta->seq_data[numseqs].name, (1+length) * sizeof(char));

  length = strlen(sq->acc);
  meta->seq_data[numseqs].acc_length = length;
  ESL_ALLOC (meta->seq_data[numseqs].acc, (1+length) * sizeof(char));

  length = strlen(sq->source);
  meta->seq_data[numseqs].source_length = length;
  ESL_ALLOC (meta->seq_data[numseqs].source, (1+length) * sizeof(char));

  length = strlen(sq->desc);
  meta->seq_data[numseqs].desc_length = length;
  ESL_ALLOC (meta->seq_data[numseqs].desc, (1+length) * sizeof(char));


  // belt-and-braces check on the four buffers allocated above
  if (meta->seq_data[numseqs].name == NULL || meta->seq_data[numseqs].acc == NULL || meta->seq_data[numseqs].source == NULL || meta->seq_data[numseqs].desc == NULL)
    esl_fatal("unable to allocate memory to store FM meta data\n");

  return eslOK;

ERROR:
  return status;
}
/* Function:  p7_checkptmx_GrowTo()
 * Synopsis:  Resize checkpointed DP matrix for new seq/model comparison.
 *
 * Purpose:   Given an existing checkpointed matrix structure <ox>,
 *            and the dimensions <M> and <L> of a new comparison,
 *            reallocate and reinitialize <ox>.
 *
 *            Essentially the same as free'ing the previous matrix and
 *            creating a new one -- but minimizes expensive memory
 *            allocation/reallocation calls.
 *            
 *            Usually <ox> only grows. The exception is if <ox> is
 *            redlined (over its recommended allocation) and the new
 *            problem size <M,L> can fit in the preset recommended
 *            allocation, then <ox> is reallocated down to the smaller
 *            recommended size.
 *            
 * Args:      ox    - existing checkpointed matrix
 *            M     - new query profile length
 *            L     - new target sequence length         
 * 
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if an allocation fails. The state of <ox> is
 *            now undefined, and the caller should not use it. 
 */
int
p7_checkptmx_GrowTo(P7_CHECKPTMX *ox, int M, int L)
{
  int     minR_chk      = (int) ceil(minimum_rows(L)) + ox->R0; /* minimum number of DP rows needed  */
  int     reset_dp_ptrs = FALSE;                                /* TRUE once dpf[] row ptrs are stale */
  int     maxR;                                                 /* max rows that fit in the recommended allocation */
  int64_t W;                                                    /* minimum row width needed, bytes */
  int     r;
  int     status;

  /* Validity of integer variable ranges may depend on design spec:                  */
  ESL_DASSERT1( (M <= 100000) );       /* design spec says, model length M <= 100000 */
  ESL_DASSERT1( (L <= 100000) );       /*           ... and,  seq length L <= 100000 */
  ESL_DASSERT1( (L >  0) );
  ESL_DASSERT1( (M >  0) );

  /* If we're debugging and we have stored copies of any matrices,
   * grow them too.  Must do this first, because we have an early exit
   * condition coming below.
   */
#ifdef p7_DEBUGGING
  if (ox->fwd && (status = p7_refmx_GrowTo(ox->fwd, M, L)) != eslOK) goto ERROR;
  if (ox->bck && (status = p7_refmx_GrowTo(ox->bck, M, L)) != eslOK) goto ERROR;
  if (ox->pp  && (status = p7_refmx_GrowTo(ox->pp,  M, L)) != eslOK) goto ERROR;
#endif

  /* Calculate W, the minimum row width needed, in bytes */
  W  = sizeof(float) * P7_NVF(M) * p7C_NSCELLS * p7_VNF;     /* vector part of row (MDI)     */
  W += ESL_UPROUND(sizeof(float) * p7C_NXCELLS, p7_VALIGN);  /* float part of row (specials); must maintain p7_VALIGN-byte alignment */

  /* Are current allocations satisfactory ?
   * Only take the early exit when we are not redlined (nalloc <= ramlimit);
   * a redlined matrix falls through so it gets a chance to be downsized.
   */
  if (W <= ox->allocW && ox->nalloc <= ox->ramlimit)
    {
      if      (L + ox->R0 <= ox->validR) { set_full        (ox, L);             return eslOK; }
      else if (minR_chk   <= ox->validR) { set_checkpointed(ox, L, ox->validR); return eslOK; }
    }

  /* Do individual matrix rows need to expand? */
  if ( W > ox->allocW)
    {
      ox->allocW    = W;
      ox->validR    = (int) (ox->nalloc / ox->allocW); /* validR must be <= allocR */
      reset_dp_ptrs = TRUE;
    }

  /* Does matrix dp_mem need reallocation, either up or down?
   * <maxR> is the row budget under the *recommended* size <ramlimit>, not
   * under the current <nalloc>. Deriving it from <nalloc> would make the
   * "downsize" branch re-request the oversized allocation, and would make
   * the "upsize" branch see maxR == validR < minR_chk.
   */
  maxR  = (int) (ox->ramlimit / ox->allocW);                    /* max rows if we use up to the recommended allocation size.      */
  if ( (ox->nalloc > ox->ramlimit && minR_chk <= maxR) ||       /* we were redlined, and recommended alloc will work: so downsize */
       minR_chk > ox->validR)                                   /* not enough memory for needed rows: so upsize                   */
    {
      set_row_layout(ox, L, maxR);
      ox->validR = ox->R0 + ox->Ra + ox->Rb + ox->Rc;   /* this may be > allocR now; we'll reallocate dp[] next, if so     */
      ox->nalloc = ox->validR * ox->allocW;
      ESL_REALLOC(ox->dp_mem, ox->nalloc + (p7_VALIGN-1)); /* (p7_VALIGN-1) because we will manually align dpf ptrs into dp_mem */
      reset_dp_ptrs = TRUE;
    }
  else  /* current validR will suffice, either full or checkpointed; we still need to calculate a layout */
    {
      if   (L+ox->R0 <= ox->validR) set_full(ox, L);
      else                          set_checkpointed(ox, L, ox->validR);
    }

  /* Does the array of row ptrs need reallocation? */
  if (ox->validR > ox->allocR)
    {
      ESL_REALLOC(ox->dpf, sizeof(float *) * ox->validR);
      ox->allocR    = ox->validR;
      reset_dp_ptrs = TRUE;
    }

  /* Do the row ptrs need to be reset?
   * Row 0 is aligned by hand inside dp_mem; every later row sits a whole
   * number of allocW-byte strides after it.
   */
  if (reset_dp_ptrs)
    {
      ox->dpf[0] = (char *) ( ( (uintptr_t) ox->dp_mem + p7_VALIGN - 1) & p7_VALIMASK); /* vectors must be aligned on p7_VALIGN-byte boundary */
      for (r = 1; r < ox->validR; r++)
        ox->dpf[r] = ox->dpf[0] + (r * ox->allocW);
    }

  return eslOK;

 ERROR:
  return status;
}
Exemple #24
0
/* Function:  p7_anchorhash_Store()
 * Synopsis:  Store a <P7_ANCHORS> array and get a key index for it.
 *
 * Purpose:   Try to store anchor set <anch> in hash table <ah>.
 *            Associate it with a unique key index, counting from
 *            0. This index lets us map the hashed data to
 *            integer-based C arrays. Return the index through
 *            <opt_index>.
 *            
 *            <D0> allows us to store suffixes, supporting the
 *            segmental divide and conquer version of the MPAS
 *            algorithm. Do not store the first <D0> anchors; only
 *            store <D0+1..D>. To store the complete anchor set, pass
 *            <D0=0>.
 *
 *            If an identical anchor set is already stored in <ah>,
 *            set <*opt_index> to the key for that anchor set, and
 *            return <eslEDUP>.
 *            
 *            Increment <ah->key_count[]> counter every time we call
 *            <_Store()> on a given anchorset suffix (not counting
 *            D0). This collects the observed frequency of sampling
 *            the anchorset suffix, which we can compare to its
 *            calculated probability.
 *
 * Args:      ah         : hash table holding different anchor sets
 *            anch       : new anchor set to try to store
 *            D0         : ignore first <D0> anchors, store <D0+1..D> (0 = store all)
 *            opt_index  : optRETURN: index of stored data
 *            
 * Returns:   <eslOK> if <anch> is new; the anchor set data are stored, 
 *            and <opt_index>, if requested, is set to the lookup 
 *            key index for the stored data.
 *            
 *            <eslEDUP> if this anchor set has already been stored before;
 *            <opt_index>, if requested, is set to the lookup key
 *            index of the previously stored data.
 *
 * Throws:    <eslEMEM> on allocation failure. 
 */
int
p7_anchorhash_Store(P7_ANCHORHASH *ah, const P7_ANCHORS *anch, int D0, int32_t *opt_index)
{
  uint32_t  val = anchorhash_function(anch->a + D0, anch->D - D0, ah->hashsize); /* hash bucket for the suffix D0+1..D */
  int32_t  *ptr;
  int32_t   idx;
  int32_t   d;
  int       status;
  
  /* Was this key already stored? Walk the collision chain for bucket <val>. */
  for (idx = ah->hashtable[val]; idx != -1; idx = ah->nxt[idx])
    {
      if (anchorhash_compare(anch->a + D0, anch->D - D0, ah->amem + ah->key_offset[idx]) == eslOK)
        {
          ah->key_count[idx]++;               /* another observation of an already-stored suffix */
          if (opt_index) *opt_index = idx;
          return eslEDUP;
        }
    }

  /* Reallocate key memory if needed (the three per-key arrays grow together) */
  if (ah->nkeys == ah->kalloc)
    {
      ESL_REALLOC(ah->key_offset, sizeof(int32_t) * ah->kalloc * 2);
      ESL_REALLOC(ah->key_count,  sizeof(int32_t) * ah->kalloc * 2);
      ESL_REALLOC(ah->nxt,        sizeof(int32_t) * ah->kalloc * 2);
      ah->kalloc *= 2;
    }

  /* Reallocate key data memory if needed (by doubling).
   * One stored record is 1 length word + 2 ints (i0,k0) per stored anchor.
   */
  while (ah->an + 2 * (anch->D - D0) + 1 > ah->aalloc)
    {
      ESL_REALLOC(ah->amem, sizeof(int32_t) * ah->aalloc * 2);
      ah->aalloc *= 2;
    }

  /* Copy the key, assign its index */
  idx                 = ah->nkeys;
  ah->key_offset[idx] = ah->an;
  ah->key_count[idx]  = 1;                      // Not ++. This is an initialization.
  ah->an             += 2 * (anch->D - D0) + 1;
  ah->nkeys++;

  ptr  = ah->amem + ah->key_offset[idx];
  *ptr = anch->D - D0;                          /* record starts with the number of anchors stored */
  for (d = D0 + 1; d <= anch->D; d++) 
    {
      ptr++; *ptr = anch->a[d].i0;
      ptr++; *ptr = anch->a[d].k0;
    }
  
  /* anchorhash needs to remember L,M so when caller asks
   * to _Get() an anchor set, anchorhash can set the sentinels
   * correctly. Fortunately even when we're only storing a 
   * suffix of <anch>, we still get the whole <anch> object,
   * which has valid sentinels, so we can deduce from them
   * what L,M are.
   *
   * The two checks below must be comparisons (==): an assignment here
   * would write through the const <anch> and could never fail.
   */
  if (ah->nkeys == 1) 
    p7_anchor_GetSentinels(anch->a, anch->D, &(ah->L), &(ah->M));
  ESL_DASSERT1(( anch->a[anch->D+1].i0 == ah->L+1 ));
  ESL_DASSERT1(( anch->a[0].k0         == ah->M+1 ));

  /* Insert new element at head of the approp chain in hashtable */
  ah->nxt[idx]       = ah->hashtable[val];
  ah->hashtable[val] = idx;

  /* Time to upsize? If we're 3x saturated, expand the hash table */
  if (ah->nkeys > 3 * ah->hashsize)
    if ((status = anchorhash_upsize(ah)) != eslOK) goto ERROR;

  if (opt_index) *opt_index = idx;
  return eslOK;

 ERROR:
  if (opt_index) *opt_index = -1;
  return status;

}
/* p7_sparsemask_Finish_avx512()
 *
 * Finish construction of the AVX-512 fields of sparse mask <sm>:
 * reverse <kmem_AVX_512> in place, set the per-row <k_AVX_512[]>
 * pointers, count rows (<nrow_AVX_512>) and segments (<S_AVX_512>),
 * grow <seg_AVX_512[]> if needed and fill in its coords plus two
 * sentinels, reset <last_i_AVX_512>/<last_k_AVX_512[]>, and finally
 * copy the AVX-512 fields into the un-suffixed fields of <sm>.
 *
 * Returns <eslOK> on success; <eslEMEM> if the seg[] reallocation
 * fails; <eslENORESULT> when built without HAVE_AVX512.
 */
int
p7_sparsemask_Finish_avx512(P7_SPARSEMASK *sm)
{
#ifdef HAVE_AVX512
  int *kcur;                    /* walks through kmem, one row's worth of cells at a time */
  int  row;                     /* sequence position, 1..L                                */
  int  g;                       /* segment index, 1..S                                    */
  int  lane;                    /* index over last_k[] entries                            */
  int  status;                  /* set by ESL_REALLOC on failure                          */

  /* Reverse kmem in place. */
  esl_vec_IReverse(sm->kmem_AVX_512, sm->kmem_AVX_512, sm->ncells_AVX_512);

  /* Set the k[] pointers; count <S> and <nrow> */
  kcur             = sm->kmem_AVX_512;
  sm->nrow_AVX_512 = 0;
  sm->S_AVX_512    = 0;
  for (row = 1; row <= sm->L; row++)
    {
      if (! sm->n_AVX_512[row])
        {
          sm->k_AVX_512[row] = NULL;
          continue;
        }

      sm->nrow_AVX_512++;
      sm->k_AVX_512[row] = kcur;
      kcur              += sm->n_AVX_512[row];
      if (sm->n_AVX_512[row-1] == 0) sm->S_AVX_512++;   /* nonempty row after an empty one opens a segment */
    }

  /* Reallocate seg[] if needed; +2 leaves room for the two sentinels. */
  if (sm->salloc_AVX_512 < sm->S_AVX_512 + 2)
    {
      ESL_REALLOC(sm->seg_AVX_512, (sm->S_AVX_512+2) * sizeof(p7_sparsemask_seg_s));
      sm->salloc_AVX_512 = sm->S_AVX_512 + 2;
      sm->n_srealloc++;
    }

  /* Set seg[] coord pairs, bracketed by sentinels at seg[0] and seg[S+1]. */
  sm->seg_AVX_512[0].ia = sm->seg_AVX_512[0].ib = -1;
  g = 1;
  for (row = 1; row <= sm->L; row++)
    {
      if (! sm->n_AVX_512[row]) continue;

      if (sm->n_AVX_512[row-1] == 0)                   sm->seg_AVX_512[g].ia   = row;
      if (row == sm->L || sm->n_AVX_512[row+1] == 0)   sm->seg_AVX_512[g++].ib = row;  /* row==L test comes first, so n[L+1] is never read */
    }
  ESL_DASSERT1(( g == sm->S_AVX_512+1 ));
  sm->seg_AVX_512[g].ia = sm->seg_AVX_512[g].ib = sm->L+2;

  /* Reset the "last stored" trackers. */
  sm->last_i_AVX_512 = -1;
  for (lane = 0; lane < p7_VNF_AVX_512; lane++)
    sm->last_k_AVX_512[lane] = -1;

  /* Mirror the AVX-512 results into the un-suffixed fields, so
   * downstream code that reads those fields sees them.
   */
  sm->seg    = sm->seg_AVX_512;
  sm->k      = sm->k_AVX_512;
  sm->n      = sm->n_AVX_512;
  sm->kmem   = sm->kmem_AVX_512;
  sm->S      = sm->S_AVX_512;
  sm->nrow   = sm->nrow_AVX_512;
  sm->ncells = sm->ncells_AVX_512;

  return eslOK;

 ERROR:
  return eslEMEM;
#else
  return eslENORESULT;
#endif
}
Exemple #26
0
/* Function:  p7_hmmcache_Open()
 * Synopsis:  Cache a profile database.
 *
 * Purpose:   Open <hmmfile> and read all of its contents, creating
 *            a cached profile database in memory. Return a ptr to the 
 *            cached profile database in <*ret_cache>. 
 *            
 *            Caller may optionally provide an <errbuf> ptr to
 *            at least <eslERRBUFSIZE> bytes, to capture an 
 *            informative error message on failure. 
 *            
 * Args:      hmmfile   - (base) name of profile file to open
 *            ret_cache - RETURN: cached profile database
 *            errbuf    - optRETURN: error message for a failure
 *
 * Returns:   <eslOK> on success. <*ret_cache> points to the
 *            cached db. <errbuf> is unchanged.
 *            
 *            Failure codes:
 *            <eslENOTFOUND> : <hmmfile> couldn't be opened for reading
 *            <eslEFORMAT>   : <hmmfile> isn't in recognized HMMER file format
 *            <eslEINCOMPAT> : profiles in <hmmfile> have different alphabets
 *
 *            On any failure, <*ret_cache> is <NULL> and <errbuf> contains
 *            an informative error message for the user.
 *
 * Throws:    <eslEMEM> : memory allocation error.
 */
int
p7_hmmcache_Open(char *hmmfile, P7_HMMCACHE **ret_cache, char *errbuf)
{
  P7_HMMCACHE *cache    = NULL;
  P7_HMMFILE  *hfp      = NULL;     
  P7_HMM      *hmm      = NULL;     /* current HMM; only needed until <gm> is configured */
  P7_BG       *bg       = NULL;     /* created lazily once the alphabet is known         */
  P7_PROFILE  *gm       = NULL;
  P7_OPROFILE *om       = NULL;     
  int          status;
  
  if (errbuf) errbuf[0] = '\0';

  ESL_ALLOC(cache, sizeof(P7_HMMCACHE));
  cache->name      = NULL;
  cache->abc       = NULL;
  cache->omlist    = NULL;
  cache->gmlist    = NULL;
  cache->lalloc    = 4096;	/* allocation chunk size for <list> of ptrs  */
  cache->n         = 0;

  /* Parenthesize the assignment: <status> must hold the esl_strdup() return
   * code, not the 0/1 result of the comparison.
   */
  if ( (status = esl_strdup(hmmfile, -1, &cache->name)) != eslOK) goto ERROR; 
  ESL_ALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc);
  ESL_ALLOC(cache->gmlist, sizeof(P7_PROFILE *)  * cache->lalloc);

  if ( (status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf)) != eslOK) goto ERROR;  // eslENOTFOUND | eslEFORMAT; <errbuf> 

  while ((status = p7_hmmfile_Read(hfp, &(cache->abc), &hmm)) != eslEOF)  // eslEFORMAT | eslEINCOMPAT; <errbuf>
    {
      if (status != eslOK) ESL_XFAIL(status, errbuf, "%s", hfp->errbuf); 

      if (!bg && (bg = p7_bg_Create(cache->abc)) == NULL)  { status = eslEMEM; goto ERROR; }

      if ( (    gm = p7_profile_Create(hmm->M, cache->abc)) == NULL)  { status = eslEMEM; goto ERROR; }
      if ( (status = p7_profile_Config(gm, hmm, bg)) != eslOK) goto ERROR;
 
      if ( (status = p7_oprofile_ReadMSV (hfp, &(cache->abc), &om)) != eslOK || /* eslEFORMAT: hfp->errbuf | eslEINCOMPAT | eslEOF */
           (status = p7_oprofile_ReadRest(hfp, om))                 != eslOK)   /* eslEFORMAT: hfp->errbuf */
        {
          if (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "Premature EOF in vectorized profile files");
          else                  goto ERROR;
        }

      ESL_DASSERT1(( strcmp(gm->name, om->name) == 0 ));

      /* Grow both ptr lists together, by doubling. */
      if (cache->n >= cache->lalloc) {
        ESL_REALLOC(cache->gmlist, sizeof(P7_PROFILE  *) * cache->lalloc * 2);
        ESL_REALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc * 2);
        cache->lalloc *= 2;
      }

      /* The cache now holds <om> and <gm>; clear the locals so the
       * ERROR path does not destroy them a second time.
       */
      cache->omlist[cache->n] = om;
      cache->gmlist[cache->n] = gm;
      cache->n++;

      om = NULL;
      gm = NULL;

      /* Same reasoning for <hmm>: don't leave a dangling ptr behind in case
       * the next read fails without resetting it (p7_hmmfile_Read() is not
       * visible here).
       */
      p7_hmm_Destroy(hmm);
      hmm = NULL;
    }

  p7_hmmfile_Close(hfp);
  p7_bg_Destroy(bg);
  *ret_cache = cache;
  return eslOK;

 ERROR:
  if (cache) p7_hmmcache_Close(cache);
  if (om)    p7_oprofile_Destroy(om);
  if (gm)    p7_profile_Destroy(gm);
  if (hmm)   p7_hmm_Destroy(hmm);
  if (bg)    p7_bg_Destroy(bg);
  if (hfp)   p7_hmmfile_Close(hfp);
  *ret_cache = NULL;            /* documented contract: <*ret_cache> is NULL on any failure */
  return status;
}
Exemple #27
0
/* Function:  esl_msafile_psiblast_Read()
 * Synopsis:  Read an alignment in PSI-BLAST's input format.
 *
 * Purpose:   Read an MSA from an open <ESLX_MSAFILE> <afp>, parsing for
 *            PSI-BLAST input format, starting from the current point.
 *            Create a new multiple alignment, and return a ptr to 
 *            that alignment via <*ret_msa>. Caller is responsible for
 *            free'ing this <ESL_MSA>.
 *            
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'x',
 *            and insert (lowercase) columns are marked '.' in this RF
 *            line.
 *            
 * Args:      afp     - open <ESLX_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> contains the newly
 *            allocated MSA. <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. 
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output: 
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslESYS> if a system call fails, such as fread().
 *            <eslEINCONCEIVABLE> - "impossible" corruption 
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_psiblast_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa      = NULL;
  int       idx      = 0;	/* counter over sequences in a block */
  int       nblocks  = 0;	/* counter over blocks */
  int64_t   alen     = 0;	/* alignment columns accumulated over finished blocks */
  int       nseq     = 0;	/* # of seqs, fixed by the first block */
  int64_t   cur_alen;
  esl_pos_t pos;		/* position on a line */
  esl_pos_t name_start,      name_len;
  esl_pos_t seq_start,       seq_len;
  esl_pos_t block_seq_start = 0;	/* seq coords of first line in block; all other lines must match */
  esl_pos_t block_seq_len   = 0;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_PSIBLAST) );

  afp->errmsg[0] = '\0';
  
  /* allocate a growable MSA. We set msa->{nseq,alen} only when we're done. */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  /* skip leading blank lines in file */
  while ( (status = eslx_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if (status != eslOK)  goto ERROR; /* includes normal EOF */
  
  /* Read the file a line at a time; if a parsing error occurs, detect immediately, with afp->linenumber set correctly */
   do { /* while in the file... */
    idx = 0;
    do { /* while in a block... */
      /* Locate the name and sequence fields on the line: <ws>name<ws>seq<ws>.
       * ctype functions need an unsigned char value; a negative plain char is undefined.
       */
      for (pos = 0;     pos < afp->n; pos++) if (! isspace((unsigned char) afp->line[pos])) break;
      name_start = pos; 
      for (pos = pos+1; pos < afp->n; pos++) if (  isspace((unsigned char) afp->line[pos])) break;
      name_len   = pos - name_start;
      for (pos = pos+1; pos < afp->n; pos++) if (! isspace((unsigned char) afp->line[pos])) break;
      seq_start  = pos;      
      if (pos >= afp->n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
      for (pos = afp->n-1; pos > 0; pos--)   if (! isspace((unsigned char) afp->line[pos])) break;
      seq_len    = pos - seq_start + 1;

      if (idx == 0) {
        block_seq_start = seq_start;
        block_seq_len   = seq_len;
      } else {
        if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
        if (seq_len   != block_seq_len)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
      }
      
      /* Process the consensus #=RF line. */
      if (idx == 0) {
        ESL_REALLOC(msa->rf, sizeof(char) * (alen + seq_len + 1));
        for (pos = 0; pos < seq_len; pos++) msa->rf[alen+pos] = '-'; /* anything neutral other than . or x will do. */
        msa->rf[alen+pos] = '\0';
      }
      for (pos = 0; pos < seq_len; pos++) 
        {
          if (afp->line[seq_start+pos] == '-') continue;
          if (isupper((unsigned char) afp->line[seq_start+pos])) {
            if (msa->rf[alen+pos] == '.') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected upper case residue (#%d on line)", (int) pos+1);
            msa->rf[alen+pos] = 'x';
          }
          if (islower((unsigned char) afp->line[seq_start+pos])) {
            if (msa->rf[alen+pos] == 'x') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected lower case residue (#%d on line)", (int) pos+1);
            msa->rf[alen+pos] = '.';
          }
        }

      /* Store the sequence name. */
      if (nblocks == 0)	{
        /* make sure we have room for another sequence */
        if (idx >= msa->sqalloc &&  (status = esl_msa_Expand(msa))                   != eslOK) goto ERROR;
        if ( (status = esl_msa_SetSeqName(msa, idx, afp->line+name_start, name_len)) != eslOK) goto ERROR;
      } else {
        /* A later block with more lines than the first block would index
         * sqname[]/ax[]/aseq[] past the <nseq> entries that were set up:
         * reject it before touching msa->sqname[idx].
         */
        if (idx >= nseq)
          ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
        if (! esl_memstrcmp(afp->line+name_start, name_len, msa->sqname[idx]))
          ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, afp->line+name_start);
      }

      /* Append the sequence. */
      cur_alen = alen;
#ifdef eslAUGMENT_ALPHABET
      if (msa->abc)    { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]),   &(cur_alen), afp->line+seq_start, seq_len); }
#endif
      if (! msa->abc)  { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
      if      (status == eslEINVAL)    ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
      else if (status != eslOK)        goto ERROR;
      if (cur_alen - alen != seq_len)  ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
      
      /* get next line. if it's blank, or if we're EOF, we're done with the block */
      idx++;
      status = eslx_msafile_GetLine(afp, NULL, NULL);
    } while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* blank line ends a block. */
    if (status != eslOK && status != eslEOF) goto ERROR; 
    /* End of one block */
    
    if     (nblocks == 0) nseq = idx;
    else if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
    alen += block_seq_len;
    nblocks++;

    /* skip blank lines to start of next block, if any */
    while ( (status = eslx_msafile_GetLine(afp, NULL, NULL)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
   } while (status == eslOK);
   if (status != eslEOF) goto ERROR;
   
   msa->nseq = nseq;
   msa->alen = alen;
   if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
   *ret_msa  = msa;
   return eslOK;

 ERROR:
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;
  return status;
}
Exemple #28
0
/* Function:  esl_msafile_a2m_Read()
 * Synopsis:  Read a UCSC A2M format alignment.
 *
 * Purpose:   Read an MSA from an open <ESL_MSAFILE> <afp>, parsing
 *            for UCSC A2M (SAM) format. Create a new MSA,
 *            and return a ptr to it in <*ret_msa>. Caller is responsible
 *            for freeing this <ESL_MSA>.
 *            
 *            The <msa> has a reference line (<msa->rf[]>) that
 *            corresponds to the uppercase/lowercase columns in the
 *            alignment: consensus (uppercase) columns are marked 'X',
 *            and insert (lowercase) columns are marked '.' in the RF
 *            annotation line.
 *
 *            This input parser can deal both with "dotless" A2M, and
 *            full A2M format with dots.
 *            
 * Args:      afp     - open <ESL_MSAFILE>
 *            ret_msa - RETURN: newly parsed <ESL_MSA>
 *
 * Returns:   <eslOK> on success. <*ret_msa> is set to the newly
 *            allocated MSA, and <afp> is at EOF.
 *
 *            <eslEOF> if no (more) alignment data are found in
 *            <afp>, and <afp> is returned at EOF. 
 *
 *            <eslEFORMAT> on a parse error. <*ret_msa> is set to
 *            <NULL>. <afp> contains information sufficient for
 *            constructing useful diagnostic output: 
 *            | <afp->errmsg>       | user-directed error message     |
 *            | <afp->linenumber>   | line # where error was detected |
 *            | <afp->line>         | offending line (not NUL-term)   |
 *            | <afp->n>            | length of offending line        |
 *            | <afp->bf->filename> | name of the file                |
 *            and <afp> is poised at the start of the following line,
 *            so (in principle) the caller could try to resume
 *            parsing.
 *            
 * Throws:    <eslEMEM> - an allocation failed.
 *            <eslESYS> - a system call such as fread() failed
 *            <eslEINCONCEIVABLE> - "impossible" corruption 
 *            On these, <*ret_msa> is returned <NULL>, and the state of
 *            <afp> is undefined.
 */
int
esl_msafile_a2m_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
  ESL_MSA  *msa        = NULL;
  char    **csflag     = NULL;	/* csflag[i][pos] is TRUE if aseq[i][pos] was uppercase consensus   */
  int       csalloc    = 0;	/* # of row ptrs allocated (and initialized) in csflag[]            */
  int      *nins       = NULL;	/* # of inserted residues before each consensus col [0..ncons-1]    */
  int      *this_nins  = NULL;	/* # of inserted residues before each consensus residue in this seq */
  int       nseq       = 0;
  int       ncons      = 0;
  int       idx;
  int64_t   thislen;
  int64_t   spos;
  int       this_ncons;
  int       cpos, bpos;
  char     *p, *tok;
  esl_pos_t n,  toklen;
  int       status;

  ESL_DASSERT1( (afp->format == eslMSAFILE_A2M) );

  afp->errmsg[0] = '\0';	

#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(csflag, sizeof(char *) * msa->sqalloc);
  csalloc = msa->sqalloc;
  for (idx = 0; idx < csalloc; idx++) csflag[idx] = NULL; 

  /* skip leading blank lines in file */
  while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK  && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
  if      (status != eslOK) goto ERROR; /* includes normal EOF */

  /* tolerate sloppy space at start of name/desc line */
  while (n && isspace((unsigned char) *p)) { p++; n--; }    
  if (*p != '>') ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected A2M name/desc line starting with >");    

  do {	/* for each record starting in '>': */
    p++; n--; 			/* advance past > */
    
    if ( (status = esl_memtok(&p, &n, " \t", &tok, &toklen))   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no name found for A2M record");
    if (nseq >= msa->sqalloc) {
      if ( (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
      ESL_REALLOC(csflag, sizeof(char *) * msa->sqalloc);
      for (idx = csalloc; idx < msa->sqalloc; idx++) csflag[idx] = NULL;
      csalloc = msa->sqalloc;	/* only updated once the new rows exist and are NULL-initialized */
    }

    if (     (status = esl_msa_SetSeqName       (msa, nseq, tok, toklen)) != eslOK) goto ERROR;
    if (n && (status = esl_msa_SetSeqDescription(msa, nseq, p,   n))      != eslOK) goto ERROR;

    /* now for each sequence line... */
    thislen = 0;		/* count of lowercase, uppercase, and '-': w/o dots, on first pass */
    this_ncons = 0;		/* count of uppercase + '-': number of consensus columns in alignment: must match for all seqs */
    if (nseq) {
      for (cpos = 0; cpos <= ncons; cpos++) // <this_nins> was allocated while reading the first seq (nseq==0), so it's valid here.
        this_nins[cpos] = 0;
    }

    while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK)
      {				
        while (n && isspace((unsigned char) *p)) { p++; n--; } /* tolerate and skip leading whitespace on line */
        if (n  == 0)   continue;	       /* tolerate and skip blank lines */
        if (*p == '>') break;

        ESL_REALLOC(csflag[nseq], sizeof(char) * (thislen + n + 1)); /* might be an overalloc by a bit, depending on whitespace on line */
        if (nseq == 0) {
          ESL_REALLOC(this_nins, sizeof(int) * (this_ncons + n + 1));
          for (cpos = this_ncons; cpos <= this_ncons+n; cpos++)
            this_nins[cpos] = 0;
        }

        for (spos = thislen, bpos = 0; bpos < n; bpos++)
          {
            if      (p[bpos] == 'O')                   continue;
            else if (isupper((unsigned char) p[bpos])) { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
            else if (islower((unsigned char) p[bpos])) { csflag[nseq][spos++] = FALSE; this_nins[this_ncons]++; }
            else if (p[bpos] == '-')                   { csflag[nseq][spos++] = TRUE;  this_ncons++;            }
            /* For every seq after the first, this_nins[] only has ncons+1 valid slots.
             * Test on <nseq>, not <ncons>: a first seq with zero consensus columns
             * (ncons==0) must still bound later seqs.
             */
            if (nseq && this_ncons > ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg,  "unexpected # of consensus residues, didn't match previous seq(s)");
          }
        csflag[nseq][spos] = TRUE; /* need a sentinel, because of the way the padding functions work */

#ifdef eslAUGMENT_ALPHABET
        if (msa->abc)   { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[nseq]),   &thislen, p, n); } 
#endif
        if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[nseq]), &thislen, p, n); }
        if (status == eslEINVAL)   ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
        else if (status != eslOK)  goto ERROR;
        ESL_DASSERT1( (spos == thislen) );
      }	
    if (status != eslOK && status != eslEOF) goto ERROR; /* exception thrown by esl_msafile_GetLine() */
    /* status == OK: then *p == '>'. status == eslEOF: we're eof.  status == anything else: error */
    /* Finished reading a sequence record. */

    /* A record with no sequence lines leaves csflag[nseq] (and, for the first
     * record, this_nins) unallocated; the code below would dereference them.
     */
    if (csflag[nseq] == NULL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no sequence data found for A2M record");
    
    if (nseq == 0) 
      {
        ncons = this_ncons;
        ESL_ALLOC(nins, sizeof(int) * (ncons+1));
        for (cpos = 0; cpos <= ncons; cpos++)
          nins[cpos] = this_nins[cpos];
      } 
    else 
      {
        if (this_ncons != ncons) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected # of consensus residues, didn't match previous seq(s)");
        for (cpos = 0; cpos <= ncons; cpos++) 
          nins[cpos]      = ESL_MAX(nins[cpos], this_nins[cpos]);
      }
    nseq++;
  } while (status == eslOK);
  
  /* Now we have nseq *unaligned* sequences in ax/aseq[0..nseq-1]; call the length slen, though we don't explicitly store it
   * csflag[idx][spos] tells us whether each unaligned residue is an insertion or consensus, for spos==0..slen-1.
   * nins[0..ncons] tells us the max number of inserted residues before each consensus column
   * This is sufficient information to reconstruct each aligned sequence.
   */
  msa->nseq = nseq;
#ifdef eslAUGMENT_ALPHABET
  if (msa->abc)  { if ((status = a2m_padding_digital(msa, csflag, nins, ncons)) != eslOK) goto ERROR; }
#endif
  if (!msa->abc) { if ((status = a2m_padding_text   (msa, csflag, nins, ncons)) != eslOK) goto ERROR; }

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  *ret_msa  = msa;
  free(nins);
  free(this_nins);
  for (idx = 0; idx < csalloc; idx++) free(csflag[idx]);   /* unused rows are NULL; free(NULL) is a no-op */
  free(csflag);
  return eslOK;
  
 ERROR:
  if (nins)      free(nins);
  if (this_nins) free(this_nins);
  if (csflag) {
    /* Free by the tracked row count, not msa->nseq: nseq is only set
     * after parsing finishes, so it would miss rows allocated so far.
     */
    for (idx = 0; idx < csalloc; idx++) 
      if (csflag[idx]) free(csflag[idx]);
    free(csflag);
  }
  if (msa) esl_msa_Destroy(msa);
  *ret_msa = NULL;              /* documented contract: <*ret_msa> is NULL on failure */
  return status;
}
Exemple #29
0
/* Function:  p7_coords2_hash_Store()
 * Synopsis:  Store a <P7_COORDS2> array and get a key index for it.
 *
 * Purpose:   In the hash table <ch>, store the array of coordinate
 *            pairs in <c2>.  Associate it with a unique key index,
 *            counting from 0. This index lets us map the hashed data
 *            to integer-based C arrays. Return the index through <opt_index>.
 *            
 *            If an identical array of paired coords has already been
 *            stored, then set <*opt_index> to the index of where the
 *            data were already stored, and return <eslEDUP>
 *
 * Args:      ch         : hash table holding different arrays of coord pairs
 *            c2         : new array of coord pairs to try to store
 *            opt_index  : optRETURN: index of stored data
 *            
 * Returns:   <eslOK> if <c2> is new; the data are stored, 
 *            and <opt_index>, if requested, is set to the lookup 
 *            key index for the stored data.
 *            
 *            <eslEDUP> if <c2> has already been stored before;
 *            <opt_index>, if requested, is set to the lookup key
 *            index of the previously stored data.
 *
 * Throws:    <eslEMEM> on allocation failure. 
 */
int
p7_coords2_hash_Store(P7_COORDS2_HASH *ch, const P7_COORDS2 *c2, int32_t *opt_index)
{
  uint32_t  bucket = p7_coords2_hash_function(c2->arr, c2->n, ch->hashsize);  /* which chain <c2> hashes to */
  int32_t  *dst;                /* write cursor into ch->cmem        */
  int32_t   key;                /* key index: chain walker, then new */
  int32_t   j;                  /* counter over coord pairs in <c2>  */
  int       status;             /* set by ESL_REALLOC on failure     */

  /* Look for an identical entry on this bucket's chain. */
  key = ch->hashtable[bucket];
  while (key != -1)
    {
      if (p7_coords2_hash_compare(c2->arr, c2->n, ch->cmem + ch->key_offset[key]) == eslOK)
        {
          if (opt_index) *opt_index = key;
          return eslEDUP;
        }
      key = ch->nxt[key];
    }

  /* Not found. Make room for one more key, doubling the per-key arrays if full. */
  if (ch->nkeys == ch->kalloc)
    {
      ESL_REALLOC(ch->key_offset, sizeof(int32_t) * ch->kalloc * 2);
      ESL_REALLOC(ch->nxt,        sizeof(int32_t) * ch->kalloc * 2);
      ch->kalloc *= 2;
    }

  /* Make room for the record itself: 1 count word + 2 ints per coord pair. */
  while (ch->cn + 2 * c2->n + 1 > ch->calloc)
    {
      ESL_REALLOC(ch->cmem, sizeof(int32_t) * ch->calloc * 2);
      ch->calloc *= 2;
    }

  /* Claim the next key index and the next free stretch of cmem. */
  key                 = ch->nkeys;
  ch->nkeys++;
  ch->key_offset[key] = ch->cn;
  ch->cn             += 2 * c2->n + 1;

  /* Serialize <c2> as: n, then (n1,n2) for each pair. */
  dst    = ch->cmem + ch->key_offset[key];
  *dst++ = c2->n;
  for (j = 0; j < c2->n; j++)
    {
      *dst++ = c2->arr[j].n1;
      *dst++ = c2->arr[j].n2;
    }

  /* Push the new key onto the head of its chain. */
  ch->nxt[key]          = ch->hashtable[bucket];
  ch->hashtable[bucket] = key;

  /* More than 3 keys per bucket on average: grow the hash table. */
  if (ch->nkeys > 3 * ch->hashsize)
    {
      status = p7_coords2_hash_upsize(ch);
      if (status != eslOK) goto ERROR;
    }

  if (opt_index) *opt_index = key;
  return eslOK;

 ERROR:
  if (opt_index) *opt_index = -1;
  return status;

}
Exemple #30
0
/* regurgitate_pfam_as_pfam()
 * 
 * Given an open Pfam formatted msafile <afp>, read the next alignment
 * one line at a time and regurgitate it to <ofp> in Pfam format, after
 * modifying it as requested. Only one line is held in memory at a
 * time, plus one reusable scratch copy of the aligned text.
 *
 * Args:
 *   afp         - open alignment file; afp->errmsg is reset on entry
 *   ofp         - output stream
 *   gapsym      - if non-NULL, the symbols "-_." in sequence lines
 *                 are converted using <gapsym> as the target set
 *   force_lower - convert sequence residues A-Z to a-z
 *   force_upper - convert sequence residues a-z to A-Z
 *   force_rna   - convert T/t to U/u in sequence lines
 *   force_dna   - convert U/u to T/t in sequence lines
 *   iupac_to_n  - convert RYMKSWHBVD (either case) to N/n
 *   x_is_bad    - convert X/x to N/n
 *   wussify     - run esl_kh2wuss() on #=GC SS_cons and #=GR SS text
 *   dewuss      - run esl_wuss2kh() on #=GC SS_cons and #=GR SS text
 *   fullwuss    - run esl_wuss_full() on #=GC SS_cons and #=GR SS text
 *   rfrom, rto  - if <rfrom> is non-NULL, symconvert() sequence text
 *                 from <rfrom> to <rto>
 *
 * #=GC and #=GR lines are only tokenized (and length-checked) when at
 * least one of <wussify>, <dewuss>, <fullwuss> is set; otherwise every
 * '#' line is echoed unchanged.
 * 
 * Returns <eslOK> on success. 
 * Returns <eslEOF> if there are no more alignments in <afp>.
 * Returns <eslEFORMAT> if parse fails because of a file format
 * problem, in which case afp->errmsg is set to contain a formatted
 * message that indicates the cause of the problem.
 * Also returns whatever failure status ESL_REALLOC() or
 * esl_memstrdup() report if scratch memory cannot be obtained.
 *
 * Some unrecoverable conditions (line read failure, missing Stockholm
 * header, missing terminating "//", failed WUSS conversion) call
 * esl_fatal() instead of returning.
 */
static int
regurgitate_pfam_as_pfam(ESLX_MSAFILE *afp, FILE *ofp, char *gapsym, int force_lower, int force_upper, int force_rna, int force_dna, int iupac_to_n, int x_is_bad, int wussify, int dewuss, int fullwuss, char *rfrom, char *rto)
{
  char      *p;
  esl_pos_t  n;
  char      *first_seqname = NULL;  /* NUL-terminated copy of the first sequence name seen */
  char      *gx      = NULL;
  char      *seqname = NULL;        /* points into afp->line; NOT NUL-terminated */
  char      *tag     = NULL;        /* points into afp->line; NOT NUL-terminated */
  char      *text    = NULL;        /* points into afp->line; NOT NUL-terminated */
  esl_pos_t  gxlen, namelen, taglen, textlen;
  int        nseq_read = 0;
  int        parse_gc_and_gr;
  int        flushpoint = 10000;    /* flush <ofp> every <flushpoint> input lines */
  int        exp_alen = -1;         /* expected aligned length; -1 until the first aligned line is seen */
  char      *buf      = NULL;       /* writable, NUL-terminated scratch copy of <text> */
  esl_pos_t  pos, pos2;
  int        status;

  /* should we parse out GR/GC lines and check if they're SS lines? */
  parse_gc_and_gr = (wussify || dewuss || fullwuss) ? TRUE : FALSE;
  afp->errmsg[0] = '\0';
   
  /* Check the magic Stockholm header line.
   * We have to skip blank lines here, else we perceive
   * trailing blank lines in a file as a format error when
   * reading in multi-record mode. Every line consumed here
   * (blank, comment, or header) is echoed to the output.
   */
  do { 
    status = eslx_msafile_GetLine(afp, &p, &n);
    if      (status == eslEOF) return eslEOF; 
    else if (status != eslOK)  esl_fatal("small mem parse error. problem reading line %d of msafile", (int) afp->linenumber);
    fprintf(ofp, "%.*s\n", (int) afp->n, afp->line);
  } while (esl_memspn(afp->line, afp->n, " \t") == afp->n  ||                 /* skip blank lines             */
           (esl_memstrpfx(afp->line, afp->n, "#")                             /* and skip comment lines       */
            && ! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM")));           /* but stop on Stockholm header */

  if (! esl_memstrpfx(afp->line, afp->n, "# STOCKHOLM 1.")) esl_fatal("small mem parse failed (line %d): missing \"# STOCKHOLM\" header", (int) afp->linenumber);

  /* Read the alignment file one line at a time.  */
  while ((status = eslx_msafile_GetLine(afp, &p, &n)) == eslOK) 
    {
      if ((int) afp->linenumber % flushpoint == 0) fflush(ofp);
      while (n && ( *p == ' ' || *p == '\t')) { p++; n--; } /* skip leading whitespace */
    
      if      (!n)                          fprintf(ofp, "\n");
      else if (esl_memstrpfx(p, n, "//")) { fprintf(ofp, "//\n"); break; } /* normal way out */
      else if (*p == '#') 
        {
          if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GC")) 
            {   /* parse line into temporary strings */
              if (esl_memtok(&p, &n, " \t",  &gx,   &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t",  &tag,  &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              if (esl_memtok(&p, &n,  " \t", &text, &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line", (int) afp->linenumber);
              pos = text - afp->line; /* pos: position of first aligned char on line; total width of annotation tag w/spaces */
        
              /* verify alignment length */
              if      (exp_alen == -1)      exp_alen = textlen;
              else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad #=GC line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
        
              /* we need to make a writable string copy of the annotation, to edit it */
              ESL_REALLOC(buf, sizeof(char) * (textlen+1));
              esl_memstrcpy(text, textlen, buf);
             
              /* esl_memstrcmp() is used as a boolean here: nonzero means <tag> equals the string */
              if (esl_memstrcmp(tag, taglen, "SS_cons")) 
                {
                  if      (wussify)  esl_kh2wuss(buf, buf);
                  else if (dewuss)   esl_wuss2kh(buf, buf);
                  else if (fullwuss) 
                    { 
                      status = esl_wuss_full(buf, buf);
                      if      (status == eslESYNTAX) esl_fatal("Bad SS_cons line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
                      else if (status != eslOK)      esl_fatal("Conversion of SS_cons line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
                    }
                }                 
              /* re-pad so the aligned text starts in its original column; 5 == strlen("#=GC ") */
              fprintf(ofp, "#=GC %.*s%*s%s\n", (int) taglen, tag, (int) (pos-taglen-5), "", buf);
            }
          else if (parse_gc_and_gr && esl_memstrpfx(p, n, "#=GR")) 
            { 
              /* Fixed: this branch must fire when the line DOES start with
               * "#=GR". The previous "== 0" test sent every other '#' line
               * (e.g. #=GF, #=GS) through the GR parser and echoed real
               * #=GR lines unconverted.
               */
              /* parse line into temporary strings */
              if (esl_memtok(&p, &n, " \t", &gx,      &gxlen)   != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              if (esl_memtok(&p, &n, " \t", &tag,     &taglen)  != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              pos = tag   - afp->line;   /* column where the tag starts */
              if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad #=GR line", (int) afp->linenumber);
              pos2 = text - afp->line;   /* column where the aligned text starts */

              /* we need to make a writable string copy of the annotation, to edit it */
              ESL_REALLOC(buf, sizeof(char) * (textlen+1));
              esl_memstrcpy(text, textlen, buf);

              /* verify alignment length */
              if      (exp_alen == -1)      exp_alen = textlen;
              else if (exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
        
              /* Fixed: convert only when the tag IS "SS" (nonzero return means
               * equal, as in the SS_cons test above); the previous "== 0"
               * converted every GR annotation except SS.
               */
              if (esl_memstrcmp(tag, taglen, "SS")) 
                {
                  if      (wussify)  esl_kh2wuss(buf, buf);
                  else if (dewuss)   esl_wuss2kh(buf, buf);
                  else if (fullwuss) { 
                    status = esl_wuss_full(buf, buf);
                    if      (status == eslESYNTAX) esl_fatal("Bad SS line: not in WUSS format, alifile line: %d", (int) afp->linenumber);
                    else if (status != eslOK)      esl_fatal("Conversion of SS line failed, code %d, alifile line: %d", status, (int) afp->linenumber);
                  }
                }                 

              /* re-pad name and tag so both columns keep their original positions; 5 == strlen("#=GR ") */
              fprintf(ofp, "#=GR %.*s%*s%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen-5), "", (int) taglen, tag, (int) (pos2-pos-taglen), "", buf);
            }
          else { /* '#' prefixed line that is not #=GC/#=GR (or it is, and wussify,dewuss,fullwuss are all FALSE) */
            fprintf(ofp, "%.*s\n", (int) afp->n, afp->line); /* print the line */
          }
        } /* end of 'if (*p == '#')' */ 
      else 
        { /* sequence line */
          if (esl_memtok(&p, &n, " \t", &seqname, &namelen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
          if (esl_memtok(&p, &n, " \t", &text,    &textlen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "--small parse failed (line %d): bad sequence line", (int) afp->linenumber);
          pos = text - afp->line;

          /* verify alignment length */
          if     (exp_alen == -1)      exp_alen = textlen;
          else if(exp_alen != textlen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "small mem parse failed (line %d): bad seq line, len %d, expected %d", (int) afp->linenumber, (int) textlen, (int) exp_alen);
      
          /* make sure we haven't just read a second line of the first sequence in file (we must be in Pfam 1 line/seq file) */
          if (nseq_read == 0) { if ((status = esl_memstrdup(seqname, namelen, &(first_seqname))) != eslOK) goto ERROR; }
          else if (esl_memstrcmp(seqname, namelen, first_seqname))
            {
              /* Fixed: <seqname> is not NUL-terminated, so it must not be
               * printed with %s. <first_seqname> holds the identical name
               * as a proper C string.
               */
              ESL_XFAIL(eslEFORMAT, afp->errmsg, "parse failed (line %d): two seqs named %s. Alignment appears to be in Stockholm format. Reformat to Pfam with esl-reformat.", (int) afp->linenumber, first_seqname);
            }
          nseq_read++;
      
          /* we need to make a writable string copy of the sequence, to edit it */
          ESL_REALLOC(buf, sizeof(char) * (textlen+1));
          esl_memstrcpy(text, textlen, buf);

          /* make adjustments as necessary */
          if (rfrom)       symconvert(buf, rfrom, rto);
          if (gapsym)      symconvert(buf, "-_.", gapsym);
          if (force_lower) symconvert(buf,
                                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                                      "abcdefghijklmnopqrstuvwxyz");
          if (force_upper) symconvert(buf,
                                      "abcdefghijklmnopqrstuvwxyz",
                                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
          if (force_rna)   symconvert(buf, "Tt", "Uu");
          if (force_dna)   symconvert(buf, "Uu", "Tt");
          if (iupac_to_n)  symconvert(buf, 
                                      "RYMKSWHBVDrymkswhbvd",
                                      "NNNNNNNNNNnnnnnnnnnn");
          if (x_is_bad)    symconvert(buf,   "Xx", "Nn");
          /* print it out, padding the name back out to the original text column */
          fprintf(ofp, "%.*s%*s%s\n", (int) namelen, seqname, (int) (pos-namelen), "", buf);
        }
    }

  /* If we saw a normal // end, we would've successfully read a line,
   * so when we get here, status (from the line read) should be eslOK.
   */ 
  if (status != eslOK) esl_fatal("--small parse failed (line %d): didn't find // at end of alignment", (int) afp->linenumber);
  free(first_seqname);
  free(buf);
  return eslOK;

 ERROR:
  /* Fixed: release scratch allocations on failure too (free(NULL) is a no-op). */
  free(first_seqname);
  free(buf);
  return status;
}