Beispiel #1
0
/* this is an obsolete derivative of old p7_tophits_Add(), which the
 * test driver happens to use; we should refactor.
 * The unit test only needs name and sort key;
 * we add the acc and desc too; and everything else is
 * at a default unused value.
 */
static int
tophits_Add(P7_TOPHITS *h, char *name, char *acc, char *desc, double sortkey)
{
  P7_HIT *hit = NULL;

  p7_tophits_CreateNextHit(h, &hit);
  esl_strdup(name, -1, &(hit->name));
  esl_strdup(acc,  -1, &(hit->acc));
  esl_strdup(desc, -1, &(hit->desc));
  hit->sortkey    = sortkey;
  return eslOK;
}
/* Function:  p7_profile_Copy()
 * Synopsis:  Copy a profile.
 *
 * Purpose:   Copies profile <src> to profile <dst>, where <dst>
 *            has already been allocated to be of sufficient size,
 *            and has the same alphabet.
 *
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEMEM> on allocation error; <eslEINVAL> if <dst> is too small 
 *            to fit <src> or is for a different alphabet.
 */
int
p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst)
{
  int x,z;
  int status;

  if (src->M         >   dst->allocM)   ESL_EXCEPTION(eslEINVAL, "destination profile is too small to hold a copy of source profile");
  if (src->abc->type != dst->abc->type) ESL_EXCEPTION(eslEINVAL, "destination profile has different alphabet than source");

  dst->M = src->M;
  esl_vec_FCopy(src->tsc, (src->M+1)*p7P_NTRANS, dst->tsc);
  for (x = 0; x < src->abc->Kp;   x++) esl_vec_FCopy(src->rsc[x], (src->M+1)*p7P_NR, dst->rsc[x]);
  for (x = 0; x < p7P_NXSTATES;   x++) esl_vec_FCopy(src->xsc[x], p7P_NXTRANS,       dst->xsc[x]);

  dst->L           = src->L;
  dst->nj          = src->nj;
  dst->pglocal     = src->pglocal;

  if (dst->name) free(dst->name);   
  if (dst->acc)  free(dst->acc);    
  if (dst->desc) free(dst->desc);   
  if ((status = esl_strdup(src->name, -1, &(dst->name)))      != eslOK) return status; 
  if ((status = esl_strdup(src->acc,  -1, &(dst->acc)))       != eslOK) return status; 
  if ((status = esl_strdup(src->desc, -1, &(dst->desc)))      != eslOK) return status; 

  strcpy(dst->rf,        src->rf);         /* RF is optional: if it's not set, *rf=0, and strcpy still works fine */
  strcpy(dst->mm,        src->mm);         /* MM is also optional annotation */
  strcpy(dst->cs,        src->cs);         /* CS is also optional annotation */
  strcpy(dst->consensus, src->consensus);  /* consensus though is always present on a valid profile */

  for (z = 0; z < p7_NEVPARAM; z++) dst->evparam[z] = src->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) dst->cutoff[z]  = src->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) dst->compo[z]   = src->compo[z];

  for (x = 0; x < p7_NOFFSETS; ++x) dst->offs[x] = src->offs[x];
  dst->roff        = src->roff;
  dst->eoff        = src->eoff;

  dst->max_length  = src->max_length;
  return eslOK;
}
Beispiel #3
0
/* Function:  hmmpgmd_GetRanges()
 * Synopsis:  Parse command flag into range(s)
 *
 * Purpose:   Given a command flag string <rangestr> of the form
 *            <start1>..<end1>,<start2>..<end2>...
 *            parse the string into a RANGE_LIST <list>
 *
 * Returns:   <eslOK> on success <TRUE>, <eslEMEM> on memory allocation failure,
 *            otherwise <eslESYNTAX> or <eslFAIL> on parsing errors.
 */
int
hmmpgmd_GetRanges (RANGE_LIST *list, char *rangestr)  {
  char *range;
  char *rangestr_cpy;
  char *rangestr_cpy_ptr;
  int status;

  list->N      = 0;
  list->starts = NULL;
  list->ends   = NULL;

  //first pass to figure out how much to allocate
  esl_strdup(rangestr, -1, &rangestr_cpy); // do this because esl_strtok modifies the string, and we shouldn't change the opts value
  rangestr_cpy_ptr = rangestr_cpy;         // do this because esl_strtok advances the pointer on the target string, but we need to free it
  while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK)  list->N++;
  ESL_ALLOC(list->starts, list->N * sizeof(int));
  ESL_ALLOC(list->ends,   list->N * sizeof(int));
  free(rangestr_cpy_ptr);

  //2nd pass to get the values
  list->N = 0;
  esl_strdup(rangestr, -1, &rangestr_cpy);
  rangestr_cpy_ptr = rangestr_cpy;
  while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) {
    status = esl_regexp_ParseCoordString(range, list->starts + list->N, list->ends + list->N);
    if (status == eslESYNTAX) esl_fatal("--seqdb_ranges takes coords <from>..<to>; %s not recognized", range);
    if (status == eslFAIL)    esl_fatal("Failed to find <from> or <to> coord in %s", range);
    list->N++;
  }
  free(rangestr_cpy_ptr);

  return eslOK;

ERROR:
  return eslEMEM;
}
Beispiel #4
0
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *            
 * Returns:   <eslOK> on success; the profile <gm> now contains 
 *            scores and is ready for searching target sequences.
 *            
 * Throws:    <eslEMEM> on allocation error.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int   k, x, z;	/* counters over states, residues, annotation */
  int   status;
  float *occ = NULL;
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  Z;
 
  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");
  if (! (hmm->flags & p7H_CONS))       ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile");

  /* Copy some pointer references and other info across from HMM  */
  gm->M                = hmm->M;
  gm->max_length       = hmm->max_length;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;
  if (gm->name != NULL) free(gm->name);
  if (gm->acc  != NULL) free(gm->acc);
  if (gm->desc != NULL) free(gm->desc);
  if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_MMASK) strcpy(gm->mm,        hmm->mm);
  if (hmm->flags & p7H_CONS)  strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
      Z = 0.;
      ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++) 
	Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++) 
	p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      free(occ);
    }
  else	/* glocal modes: left wing retraction; must be in log space for precision */
    {
      Z = log(hmm->t[0][p7H_MD]);
      p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]);
      for (k = 1; k < hmm->M; k++) 
	{
	   p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]);
	   Z += log(hmm->t[k][p7H_DD]);
	}
    }

  /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits
   * N,C,J transitions are set later by length config 
   */
  if (p7_profile_IsMultihit(gm)) {
    gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2;   
    gm->nj                   = 1.0f;
  } else {
    gm->xsc[p7P_E][p7P_MOVE] = 0.0f;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY;  
    gm->nj                   = 0.0f;
  }

  /* Transition scores. */
  for (k = 1; k < gm->M; k++) {
    tp = gm->tsc + k * p7P_NTRANS;
    tp[p7P_MM] = log(hmm->t[k][p7H_MM]);
    tp[p7P_MI] = log(hmm->t[k][p7H_MI]);
    tp[p7P_MD] = log(hmm->t[k][p7H_MD]);
    tp[p7P_IM] = log(hmm->t[k][p7H_IM]);
    tp[p7P_II] = log(hmm->t[k][p7H_II]);
    tp[p7P_DM] = log(hmm->t[k][p7H_DM]);
    tp[p7P_DD] = log(hmm->t[k][p7H_DD]);
  }
  
  /* Match emission scores. */
  sc[hmm->abc->K]     = -eslINFINITY; /* gap character */
  sc[hmm->abc->Kp-2]  = -eslINFINITY; /* nonresidue character */
  sc[hmm->abc->Kp-1]  = -eslINFINITY; /* missing data character */
  for (k = 1; k <= hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
     sc[x] = log((double)hmm->mat[k][x] / bg->f[x]);

    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 

    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k * p7P_NR;
      rp[p7P_MSC] = sc[x];
    }
  }

  /* Insert emission scores */
  /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores
   * to 0, i.e. corresponding to the insertion emission probabilities
   * being equal to the background probabilities. Benchmarking shows
   * that setting inserts to informative emission distributions causes
   * more problems than it's worth: polar biased composition hits
   * driven by stretches of "insertion" occur, and are difficult to
   * correct for.
   */
  for (x = 0; x < gm->abc->Kp; x++)
    {
      for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f;
      p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
    }
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K)    = -eslINFINITY; /* gap symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */


#if 0
  /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */
  for (k = 1; k < hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++) 
      sc[x] = log(hmm->ins[k][x] / bg->f[x]); 
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k*p7P_NR;
      rp[p7P_ISC] = sc[x];
    }
  }    
  for (x = 0; x < hmm->abc->Kp; x++)
    p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
#endif

  /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength()
   */
  gm->L = 0;			/* force ReconfigLength to reconfig */
  if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  if (occ != NULL) free(occ);
  return status;
}
static int
profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr )
{
  /// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method.
  /// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples...
  ESL_MSA                 *msa      = NULL;
  string profile_string;
  char *buf;
  long len;
  int                      seqidx;
  int                      status;
  char       errmsg2[eslERRBUFSIZE];

  ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC));

  const char * const seqname = "Galosh Profile Consensus";
  const char * const msaname = "Galosh Profile";
  uint32_t profile_length;
  galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence;
  stringstream tmp_consensus_output_stream;

  uint32_t pos_i;

  if (profile_ptr == NULL)  { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); }
  //if (feof(afp->bf->fp))  { status = eslEOF; goto ERROR; }
  afp->errmsg[0] = '\0';

  // Read in the galosh profile (from profillic)
  //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end
  //len = afp->bf->ftell( afp->bf->fp ); // get the position at the end (length)
  //fseek( afp->bf->fp, 0, SEEK_SET ); // go to the beginning again.

  //ESL_ALLOC_CPP( char, buf, sizeof( char ) * len ); //malloc buffer
  //fread( buf, len, 1, afp->bf->fp ); //read into buffer

  //profile_string = buf;
  //profile_ptr->fromString( profile_string );
  profile_ptr->fromFile( afp->bf->filename );
  //if (buf)      free(buf);
  // \todo WHY WON'T THIS WORK?  See HACKs in profillic-hmmbuild.cpp to work around it.
  //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end (to signal there's no more profiles in the file, the next time we come to this function)

  // Calculate the consensus sequence.
  profile_length = profile_ptr->length();
  consensus_sequence.reinitialize( profile_length );
  for( pos_i = 0; pos_i < profile_length; pos_i++ ) {
    consensus_sequence[ pos_i ] =
      ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType();
  }
  tmp_consensus_output_stream << consensus_sequence;

  /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }


  // Set first-and-only seq to the consensus.  This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself.  Also msa->sqname[0] to the "name" of that consensus sequence.

  /* if nec, make room for the new seq */
  if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) return status; 
  seqidx = msa->nseq; // 0
  msa->nseq++; // = 1
  status = esl_strdup(seqname, -1, &(msa->sqname[seqidx]));
  // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc).
#ifdef eslAUGMENT_ALPHABET
  if (msa->flags & eslMSA_DIGITAL)
    {
      // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, "
      if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), tmp_consensus_output_stream.str().c_str(), profile_length)) != eslOK) {
        /* invalid char(s), get informative error message */
        if (esl_abc_ValidateSeq(msa->abc, tmp_consensus_output_stream.str().c_str(), profile_length, afp->errmsg) != eslOK) 
          ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg);
      }
    }
#endif
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      status = esl_strcat(&(msa->aseq[seqidx]), 0, tmp_consensus_output_stream.str().c_str(), profile_length);
      msa->sqlen[seqidx] = profile_length;
    } 
  msa->alen = profile_length;

  /// \todo OR read in a fasta file of sequences too.
  /// \todo (Optional?) Set msa->name to the name of the profile (file?)
  esl_strdup(msaname, -1, &(msa->name));
  /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0.
  /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0].
  /// \todo Investigate what msa->sa and msa->pp are for.

  /* Give the newly parsed MSA a good
   * going-over, and finalize the fields of the MSA data structure.
   * verify_parse will fill in errmsg if it sees a problem.
   */
  //if (verify_parse(msa, afp->errmsg) != eslOK) { status = eslEFORMAT; goto ERROR; } 

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa);
  return eslOK;

 ERROR:
  if (msa != NULL)      esl_msa_Destroy(msa);
  if (ret_msa != NULL) *ret_msa = NULL;
  return status;
}
Beispiel #6
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_STOPWATCH  *w        = esl_stopwatch_Create();
  ESL_RANDOMNESS *r        = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
  int             N        = esl_opt_GetInteger(go, "-N");
  int             M        = esl_opt_GetInteger(go, "-M");
  P7_TOPHITS    **h        = NULL;
  P7_HIT         *hit      = NULL;
  double         *sortkeys = NULL;
  char            name[]   = "not_unique_name";
  char            acc[]    = "not_unique_acc";
  char            desc[]   = "Test description for the purposes of making the benchmark allocate space";
  int             i,j;
  int             status;

  /* prep work: generate our sort keys before starting to time anything    */
  ESL_ALLOC(h,        sizeof(P7_TOPHITS *) * M); /* allocate pointers for M lists */
  ESL_ALLOC(sortkeys, sizeof(double) * N * M);   
  for (i = 0; i < N*M; i++) sortkeys[i] = esl_random(r);

  esl_stopwatch_Start(w);

  /* generate M "random" lists and sort them */
  for (j = 0; j < M; j++)
  {
      h[j] = p7_tophits_Create(p7_TOPHITS_DEFAULT_INIT_ALLOC);
      for (i = 0; i < N; i++)
	{
	  p7_tophits_CreateNextHit(h[j], &hit);
	  esl_strdup(name, -1, &(hit->name));
	  esl_strdup(acc,  -1, &(hit->acc));
	  esl_strdup(desc, -1, &(hit->desc));
	  hit->sortkey    = sortkeys[j*N + i];
	  hit->score      = (float) sortkeys[j*N+i]; 
	  hit->pre_score  = 0.0;
	  hit->sum_score  = 0.0;
	  hit->lnP        = sortkeys[j*N+i];
	  hit->pre_lnP    = 0.0;
	  hit->sum_lnP    = 0.0;
	  hit->ndom       = N;
	  hit->noverlaps  = 0;
	  hit->nexpected  = 0;
	  hit->flags      = 0;
	  hit->nreported  = 0;
	  hit->nincluded  = 0;
	  hit->best_domain = 0;
	  hit->dcl         = NULL;
	}
      p7_tophits_SortBySortkey(h[j]);
  }
  /* then merge them into one big list in h[0] */
  for (j = 1; j < M; j++)
  {
      p7_tophits_Merge(h[0], h[j]);
      p7_tophits_Destroy(h[j]);
  }

  esl_stopwatch_Stop(w);

  p7_tophits_Destroy(h[0]);
  status = eslOK;
 ERROR:
  esl_getopts_Destroy(go);
  esl_stopwatch_Destroy(w);
  esl_randomness_Destroy(r);
  if (sortkeys != NULL) free(sortkeys);
  if (h != NULL) free(h);
  return status;
}
int
main(int argc, char **argv)
{

  int i,j;

  ESL_GETOPTS     *go = NULL;	/* command line processing                 */
  ESL_STOPWATCH   *w  = esl_stopwatch_Create();

  int              status;
  ESL_MSA      *msa         = NULL;
  FILE         *ofp         = NULL;    /* output file (default is stdout) */
  ESL_ALPHABET *abc         = NULL;    /* digital alphabet */

  char         *alifile;  /* name of the alignment file we're building HMMs from  */
  ESLX_MSAFILE *afp         = NULL;            /* open alifile  */
  int           infmt       = eslMSAFILE_UNKNOWN;    /* autodetect alignment format by default. */
  int           outfmt      = eslMSAFILE_STOCKHOLM;


  char         *postmsafile;  /* optional file to resave annotated, modified MSAs to  */
  FILE         *postmsafp = NULL;  /* open <postmsafile>, or NULL */

  int           mask_range_cnt = 0;
  uint32_t      mask_starts[100]; // over-the-top allocation.
  uint32_t      mask_ends[100];

  char         *rangestr;
  char         *range;


  int     *map = NULL; /* map[i]=j,  means model position i comes from column j of the alignment; 1..alen */

  int    keep_mm;

  /* Set processor specific flags */
  impl_Init();
  alifile     = NULL;
  postmsafile = NULL;

  /* Parse the command line
   */
  process_commandline(argc, argv, &go, &alifile, &postmsafile);
  keep_mm = esl_opt_IsUsed(go, "--apendmask");

  /* Initialize what we can in the config structure (without knowing the alphabet yet).
   * Fields controlled by masters are set up in usual_master() or mpi_master()
   * Fields used by workers are set up in mpi_worker()
   */
  ofp         = NULL;
  infmt         = eslMSAFILE_UNKNOWN;
  afp         = NULL;
  abc         = NULL;

  if (esl_opt_IsOn(go, "--informat")) {
    infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"));
    if (infmt == eslMSAFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--informat"));
  }

  /* Determine output alignment file format */
  outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
  if (outfmt == eslMSAFILE_UNKNOWN)    p7_Fail(argv[0], "%s is not a recognized output MSA file format\n", esl_opt_GetString(go, "--outformat"));



  /* Parse the ranges */

  if (esl_opt_IsUsed(go, "--alirange")) {
    esl_strdup(esl_opt_GetString(go, "--alirange"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--modelrange")) {
    esl_strdup(esl_opt_GetString(go, "--modelrange"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--model2ali")) {
    esl_strdup(esl_opt_GetString(go, "--model2ali"), -1, &rangestr) ;
  } else if (esl_opt_IsUsed(go, "--ali2model")) {
    esl_strdup(esl_opt_GetString(go, "--ali2model"), -1, &rangestr) ;
  } else {
    if (puts("Must specify mask range with --modelrange, --alirange, --model2ali, or --ali2model\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto ERROR;
  }

  while ( (status = esl_strtok(&rangestr, ",", &range) ) == eslOK) {
    status = esl_regexp_ParseCoordString(range, mask_starts + mask_range_cnt, mask_ends + mask_range_cnt );
    if (status == eslESYNTAX) esl_fatal("range flags take coords <from>..<to>; %s not recognized", range);
    if (status == eslFAIL)    esl_fatal("Failed to find <from> or <to> coord in %s", range);

    mask_range_cnt++;
  }


  /* Start timing. */
  esl_stopwatch_Start(w);


  /* Open files, set alphabet.
   *   afp       - open alignment file for input
   *   abc       - alphabet expected or guessed in ali file
   *   postmsafp - open MSA output file
   *   ofp       - optional open output file, or stdout
   */
  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);
  else                                          abc = NULL;
  
  status = eslx_msafile_Open(&abc, alifile, NULL, infmt, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    postmsafp = fopen(postmsafile, "w");
    if (postmsafp == NULL) p7_Fail("Failed to MSA output file %s for writing", postmsafile);
  }

  if (esl_opt_IsUsed(go, "-o")) 
    {
      ofp = fopen(esl_opt_GetString(go, "-o"), "w");
      if (ofp == NULL) p7_Fail("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
    } 
  else ofp = stdout;


  /* Looks like the i/o is set up successfully...
   * Initial output to the user
   */
  output_header(go, ofp, alifile, postmsafile);                                  /* cheery output header                                */

  /* read the alignment */
  if ((status = eslx_msafile_Read(afp, &msa)) != eslOK)  eslx_msafile_ReadFailure(afp, status);


  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    /* add/modify mmline for the mask */
    if (msa->mm == NULL) {
      ESL_ALLOC(msa->mm, msa->alen);
      keep_mm = FALSE;
    }

    if (!keep_mm)
      for (i=0; i<msa->alen; i++) msa->mm[i] = '.';

  }

  // convert model coordinates to alignment coordinates, if necessary
  if (esl_opt_IsUsed(go, "--modelrange") || esl_opt_IsUsed(go, "--model2ali") || esl_opt_IsUsed(go, "--ali2model") ) {

    float symfrac = esl_opt_GetReal(go, "--symfrac");
    int do_hand  =  esl_opt_IsOn(go, "--hand");
    int L;

    //same as p7_builder relative_weights
    if      (esl_opt_IsOn(go, "--wnone")  )                  { esl_vec_DSet(msa->wgt, msa->nseq, 1.); }
    else if (esl_opt_IsOn(go, "--wgiven") )                  ;
    else if (esl_opt_IsOn(go, "--wpb")    )                  status = esl_msaweight_PB(msa);
    else if (esl_opt_IsOn(go, "--wgsc")   )                  status = esl_msaweight_GSC(msa);
    else if (esl_opt_IsOn(go, "--wblosum"))                  status = esl_msaweight_BLOSUM(msa, esl_opt_GetReal(go, "--wid"));

    if ((status =  esl_msa_MarkFragments(msa, esl_opt_GetReal(go, "--fragthresh")))           != eslOK) goto ERROR;

    //build a map of model mask coordinates to alignment coords
    ESL_ALLOC(map, sizeof(int)     * (msa->alen+1));
    L = p7_Alimask_MakeModel2AliMap(msa, do_hand, symfrac, map );


    if ( esl_opt_IsUsed(go, "--model2ali") ) {
      //print mapping
      printf ("model coordinates     alignment coordinates\n");
      for (i=0; i<mask_range_cnt; i++)
        printf ("%8d..%-8d -> %8d..%-8d\n", mask_starts[i], mask_ends[i], map[mask_starts[i]-1], map[mask_ends[i]-1]);
      /* If I wanted to, I could print all the map values independently:
        printf("\n\n-----------\n");
        printf("Map\n");
        printf("---\n");
        for (i=0; i<L; i++)
          printf("%d -> %d\n", i+1, map[i]);
      */
    } else if ( esl_opt_IsUsed(go, "--ali2model") ) {
      //print mapping  (requires scanning the inverted map
      int alistart = 0;
      int aliend = 0;
      printf ("alignment coordinates     model coordinates\n");
      for (i=0; i<mask_range_cnt; i++) {
        //find j for ali positions
        while (map[alistart] < mask_starts[i] )
          alistart++;
        aliend = alistart;
        while (map[aliend] < mask_ends[i] )
          aliend++;

        printf ("   %8d..%-8d -> %8d..%-8d\n", map[alistart], map[aliend], alistart+1, aliend+1);
      }
    } else {
      //convert the mask coords based on map
      for (i=0; i<mask_range_cnt; i++) {
          mask_starts[i] = map[mask_starts[i]-1]; //-1 because mmline is offset by one relative to the 1-base alignment
          mask_ends[i]   = map[mask_ends[i]-1];
      }
    }
  }

  if (esl_opt_IsUsed(go, "--alirange") || esl_opt_IsUsed(go, "--modelrange") ) {
    //overwrite '.' with 'm' everywhere the range says to do it
    for (i=0; i<mask_range_cnt; i++)
      for (j=mask_starts[i]; j<=mask_ends[i]; j++)
        msa->mm[j-1] = 'm';

    if ((status = eslx_msafile_Write(postmsafp, msa, outfmt))  != eslOK) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
  }

  esl_stopwatch_Stop(w);

  if (esl_opt_IsOn(go, "-o"))  fclose(ofp);
  if (postmsafp) fclose(postmsafp);
  if (afp)   eslx_msafile_Close(afp);
  if (abc)   esl_alphabet_Destroy(abc);

  esl_getopts_Destroy(go);
  esl_stopwatch_Destroy(w);
  return 0;


  ERROR:
   return eslFAIL;
}
Beispiel #8
0
/* Function:  p7_hmmcache_Open()
 * Synopsis:  Cache a profile database.
 *
 * Purpose:   Open <hmmfile> and read all of its contents, creating
 *            a cached profile database in memory. Return a ptr to the 
 *            cached profile database in <*ret_cache>. 
 *            
 *            Caller may optionally provide an <errbuf> ptr to
 *            at least <eslERRBUFSIZE> bytes, to capture an 
 *            informative error message on failure. 
 *            
 * Args:      hmmfile   - (base) name of profile file to open
 *            ret_cache - RETURN: cached profile database
 *            errbuf    - optRETURN: error message for a failure
 *
 * Returns:   <eslOK> on success. <*ret_cache> points to the
 *            cached db. <errbuf> is unchanged.
 *            
 *            Failure codes:
 *            <eslENOTFOUND> : <hmmfile> couldn't be opened for reading
 *            <eslEFORMAT>   : <hmmfile> isn't in recognized HMMER file format
 *            <eslEINCOMPAT> : profiles in <hmmfile> have different alphabets
 *
 *            On any failure, <*ret_cache> is <NULL> and <errbuf> contains
 *            an informative error message for the user.
 *
 * Throws:    <eslEMEM> : memory allocation error.
 */
int
p7_hmmcache_Open(char *hmmfile, P7_HMMCACHE **ret_cache, char *errbuf)
{
  P7_HMMCACHE *cache    = NULL;
  P7_HMMFILE  *hfp      = NULL;     
  P7_HMM      *hmm      = NULL;
  P7_BG       *bg       = NULL;
  P7_PROFILE  *gm       = NULL;
  P7_OPROFILE *om       = NULL;     
  int          status;
  
  if (errbuf) errbuf[0] = '\0';

  ESL_ALLOC(cache, sizeof(P7_HMMCACHE));
  cache->name      = NULL;
  cache->abc       = NULL;
  cache->omlist    = NULL;
  cache->gmlist    = NULL;
  cache->lalloc    = 4096;	/* allocation chunk size for <list> of ptrs  */
  cache->n         = 0;

  if ( ( status = esl_strdup(hmmfile, -1, &cache->name) != eslOK)) goto ERROR; 
  ESL_ALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc);
  ESL_ALLOC(cache->gmlist, sizeof(P7_PROFILE *)  * cache->lalloc);

  if ( (status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf)) != eslOK) goto ERROR;  // eslENOTFOUND | eslEFORMAT; <errbuf> 

  while ((status = p7_hmmfile_Read(hfp, &(cache->abc), &hmm)) != eslEOF)  // eslEFORMAT | eslEINCOMPAT; <errbuf>
    {
      if (status != eslOK) ESL_XFAIL(status, errbuf, "%s", hfp->errbuf); 

      if (!bg && (bg = p7_bg_Create(cache->abc)) == NULL)  { status = eslEMEM; goto ERROR; }

      if ( (    gm = p7_profile_Create(hmm->M, cache->abc)) == NULL)  { status = eslEMEM; goto ERROR; }
      if ( (status = p7_profile_Config(gm, hmm, bg)) != eslOK) goto ERROR;
 
      if ( (status = p7_oprofile_ReadMSV (hfp, &(cache->abc), &om)) != eslOK || /* eslEFORMAT: hfp->errbuf | eslEINCOMPAT | eslEOF */
	   (status = p7_oprofile_ReadRest(hfp, om))                 != eslOK)   /* eslEFORMAT: hfp->errbuf */
	{
	  if (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "Premature EOF in vectorized profile files");
	  else                  goto ERROR;
	}

      ESL_DASSERT1(( strcmp(gm->name, om->name) == 0 ));

      if (cache->n >= cache->lalloc) {
	ESL_REALLOC(cache->gmlist, sizeof(P7_PROFILE  *) * cache->lalloc * 2);
	ESL_REALLOC(cache->omlist, sizeof(P7_OPROFILE *) * cache->lalloc * 2);
	cache->lalloc *= 2;
      }

      cache->omlist[cache->n] = om;
      cache->gmlist[cache->n] = gm;
      cache->n++;

      om = NULL;
      gm = NULL;
      p7_hmm_Destroy(hmm);
    }

  //printf("\nfinal:: %d  memory %" PRId64 "\n", inx, total_mem);
  p7_hmmfile_Close(hfp);
  p7_bg_Destroy(bg);
  *ret_cache = cache;
  return eslOK;

 ERROR:
  if (cache) p7_hmmcache_Close(cache);
  if (om)    p7_oprofile_Destroy(om);
  if (gm)    p7_profile_Destroy(gm);
  if (hmm)   p7_hmm_Destroy(hmm);
  if (bg)    p7_bg_Destroy(bg);
  if (hfp)   p7_hmmfile_Close(hfp);
  return status;
}
Beispiel #9
0
/* Function:  p7_ProfileConfig()
* Synopsis:  Configure a search profile.
* Incept:    SRE, Sun Sep 25 12:21:25 2005 [St. Louis]
*
* Purpose:   Given a model <hmm> with core probabilities, the null1
*            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
*            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
*            expected target sequence length <L>; configure the
*            search model in <gm> with lod scores relative to the
*            background frequencies in <bg>.
*            
* Returns:   <eslOK> on success; the profile <gm> now contains 
*            scores and is ready for searching target sequences.
*            
* Throws:    <eslEMEM> on allocation error.
*/
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
    int   k, x, z;	/* counters over states, residues, annotation */
    int   status;
    float *occ = NULL;
    float *tp, *rp;
    float  sc[p7_MAXCODE];
    float  mthresh;
    float  Z;

    /* Contract checks */
    if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
    if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");

    /* Copy some pointer references and other info across from HMM  */
    gm->M      = hmm->M;
    gm->mode   = mode;
    gm->roff   = -1;
    gm->eoff   = -1;
    gm->offs[p7_MOFFSET] = -1;
    gm->offs[p7_FOFFSET] = -1;
    gm->offs[p7_POFFSET] = -1;
    if (gm->name != NULL) free(gm->name);
    if (gm->acc  != NULL) free(gm->acc);
    if (gm->desc != NULL) free(gm->desc);
    if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
    if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf);
    if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs);
    for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
    for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
    for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

    /* Determine the "consensus" residue for each match position.
    * This is only used for alignment displays, not in any calculations.
    */
    if      (hmm->abc->type == eslAMINO) mthresh = 0.5;
    else if (hmm->abc->type == eslDNA)   mthresh = 0.9;
    else if (hmm->abc->type == eslRNA)   mthresh = 0.9;
    else                                 mthresh = 0.5;
    gm->consensus[0] = ' ';
    for (k = 1; k <= hmm->M; k++) {
        x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K);
        gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x]));
    }
    gm->consensus[hmm->M+1] = '\0';

    /* Entry scores. */
    if (p7_profile_IsLocal(gm))
    {
        /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
        * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
        Z = 0.;
        ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1));

        if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
        for (k = 1; k <= hmm->M; k++) 
            Z += occ[k] * (float) (hmm->M-k+1);
        for (k = 1; k <= hmm->M; k++) 
            p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */

        free(occ);
    }
    else	/* glocal modes: left wing retraction; must be in log space for precision */
    {
Beispiel #10
0
int
p7_seqcache_Open(char *seqfile, P7_SEQCACHE **ret_cache, char *errbuf)
{
  int                i;
  int                inx;
  int                val;
  int                status;

  int32_t            seq_cnt;
  int32_t            db_cnt;
  int32_t            db_inx[32];
  uint32_t           db_key;

  uint64_t           res_cnt;
  uint64_t           res_size;
  uint64_t           hdr_size;

  char              *hdr_ptr;
  char              *res_ptr;
  char              *desc_ptr;
  char              *ptr;
  char               buffer[512];
  off_t              offset;

  uint64_t           total_mem;

  SEQ_DB            *db         = NULL;
  P7_SEQCACHE       *cache      = NULL;

  ESL_RANDOMNESS    *rnd        = NULL;
  ESL_SQFILE        *sqfp       = NULL;
  ESL_SQ            *sq         = NULL;
  ESL_ALPHABET      *abc        = NULL;
  ESL_SQASCII_DATA  *ascii      = NULL;

  if (errbuf) errbuf[0] = '\0';	/* CURRENTLY UNUSED. FIXME */

  /* Open the target sequence database */
  if ((status = esl_sqfile_Open(seqfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) return status;

  /* This is a bit of a hack.  The first line contains database information.
   *
   * #<res_count> <seq_count> <db_count> <db_sequences_1> <db_sequences_before_removing_duplicates_1> <db_sequences_2> <db_sequences_before_removing_duplicates_2>  ... <date_stamp>
   *
   * The rest of the file is a fasta format.  The fasta header is just
   * sequence number followed by a binary number indicating which
   * database this sequence occurs in.
   *
   * The header line will be read in, parsed and saved.  Then the
   * parser will be repositioned after the line and used normally.
   */
  ascii = &sqfp->data.ascii;
  fseek(ascii->fp, 0L, SEEK_SET);
  if (fgets(buffer, sizeof(buffer), ascii->fp) == NULL) return eslEFORMAT;
  if (buffer[0] != '#')                                 return eslEFORMAT;

  ptr = buffer + 1;
  res_cnt = strtoll(ptr, &ptr, 10);
  seq_cnt = strtol(ptr, &ptr, 10);
  db_cnt  = strtol(ptr, &ptr, 10);

  if (db_cnt > (sizeof(db_inx)/sizeof(db_inx[0])))      return eslEFORMAT;

  total_mem = sizeof(P7_SEQCACHE);
  ESL_ALLOC(cache, sizeof(P7_SEQCACHE));
  memset(cache, 0, sizeof(P7_SEQCACHE));

  if (esl_strdup(seqfile, -1, &cache->name) != eslOK)   goto ERROR;

  total_mem += (sizeof(HMMER_SEQ) * seq_cnt);
  ESL_ALLOC(cache->list, sizeof(HMMER_SEQ) * seq_cnt);
  memset(cache->list, 0, sizeof(HMMER_SEQ) * seq_cnt);

  total_mem += (sizeof(SEQ_DB) * db_cnt);
  ESL_ALLOC(db, sizeof(SEQ_DB) * db_cnt);
  for (i = 0; i < db_cnt; ++i) {
    db[i].count  = strtol(ptr, &ptr, 10);
    db[i].K      = strtol(ptr, &ptr, 10);
    total_mem   += (sizeof(HMMER_SEQ *) * db[i].count);
    ESL_ALLOC(db[i].list, sizeof(HMMER_SEQ *) * db[i].count);
    memset(db[i].list, 0, sizeof(HMMER_SEQ *) * db[i].count);
  }

  /* grab the unique identifier */
  while (*ptr && isspace(*ptr)) ++ptr;
  i = strlen(ptr);
  ESL_ALLOC(cache->id, i+1);
  strcpy(cache->id, ptr);
  while (--i > 0 && isspace(cache->id[i])) cache->id[i] = 0;

  res_size = res_cnt + seq_cnt + 1;
  hdr_size = seq_cnt * 10;

  total_mem += res_size + hdr_size;
  ESL_ALLOC(cache->residue_mem, res_size);
  ESL_ALLOC(cache->header_mem, hdr_size);

  /* position the sequence file to the start of the first sequence.
   * this will force any buffers associated with the file to be reset.
   */
  offset = ftell(ascii->fp);
  if ((status = esl_sqfile_Position(sqfp, offset)) != eslOK) goto ERROR;

  abc = esl_alphabet_Create(eslAMINO);
  sq  = esl_sq_CreateDigital(abc);

  cache->db_cnt      = db_cnt;
  cache->db          = db;
  cache->abc         = abc;
  cache->res_size    = res_size;
  cache->hdr_size    = hdr_size;
  cache->count       = seq_cnt;

  hdr_ptr = cache->header_mem;
  res_ptr = cache->residue_mem;
  for (i = 0; i < db_cnt; ++i) db_inx[i] = 0;

  strcpy(buffer, "000000001");
  
  inx = 0;
  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK) {

    /* sanity checks */
    if (inx >= seq_cnt)       { printf("inx: %d\n", inx); return eslEFORMAT; }
    if (sq->n + 1 > res_size) { printf("inx: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; }
    if (hdr_size <= 0)        { printf("inx: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; }

    /* generate the database key - modified to take the first word in the desc line.
     * The remaining part of the desc is then cached as the description.  */

    ptr = sq->desc;;
    desc_ptr = strchr(sq->desc, ' ');
    if(desc_ptr != NULL) {
    	*desc_ptr= '\0';
    	++desc_ptr;
    }
    val = 1;
    db_key = 0;
    while (*ptr) {
      if (*ptr == '1') db_key += val;
      val <<= 1;
      ++ptr;
    }


    if (db_key >= (1 << (db_cnt + 1))) { printf("inx: %d db %d %s\n", inx, db_key, sq->desc); return eslEFORMAT; }

    cache->list[inx].name   = hdr_ptr;
    cache->list[inx].dsq    = (ESL_DSQ *)res_ptr;
    cache->list[inx].n      = sq->n;
    cache->list[inx].idx    = inx;
    cache->list[inx].db_key = db_key;
    if(desc_ptr != NULL) esl_strdup(desc_ptr, -1, &(cache->list[inx].desc));

    /* copy the digitized sequence */
    memcpy(res_ptr, sq->dsq, sq->n + 1);
    res_ptr  += (sq->n + 1);
    res_size -= (sq->n + 1);

    /* copy the index to the header */
    strcpy(hdr_ptr, buffer);
    hdr_ptr += 10;
    hdr_size -= 10;

    /* increment the buffer string */
    ++buffer[8];
    for (i = 8; i > 0; --i) {
      if (buffer[i] > '9') {
        buffer[i] = '0';
        buffer[i-1]++;
      }
    }

    esl_sq_Reuse(sq);
    ++inx;    
  }
  if (status != eslEOF) { printf("Unexpected error %d at %d\n", status, inx); return status; }

  if (inx != seq_cnt) { printf("inx:: %d %d\n", inx, seq_cnt); return eslEFORMAT; }
  if (hdr_size != 0)  { printf("inx:: %d hdr %d\n", inx, (int)hdr_size); return eslEFORMAT; }
  if (res_size != 1)  { printf("inx:: %d size %d %d\n", inx, (int)sq->n + 1, (int)res_size); return eslEFORMAT; }

  /* copy the final sentinel character */
  *res_ptr++ = eslDSQ_SENTINEL;
  --res_size;

  /* sort the order of the database sequences */
  rnd = esl_randomness_CreateFast(seq_cnt);
  for (i = 0 ; i < seq_cnt; ++i) {
    rnd->x = rnd->x * 69069 + 1;
    cache->list[i].idx = rnd->x;
  }
  esl_randomness_Destroy(rnd);
  qsort(cache->list, seq_cnt, sizeof(HMMER_SEQ), sort_seq);

  /* fill in the different databases and fix the index */
  for (i = 0 ; i < seq_cnt; ++i) {
    inx = 0;
    db_key = cache->list[i].db_key;
    while (db_key) {
      if (db_key & 1) {
        SEQ_DB *db = cache->db + inx;
        if (db_inx[inx] >= db->count) { printf("sort:: %d %d\n", db_inx[inx], db->count); return eslEFORMAT; }
        db->list[db_inx[inx]] = &cache->list[i];
        ++db_inx[inx];
      }
      db_key >>= 1;
      ++inx;
    }
    cache->list[i].idx = (cache->list[i].name - cache->header_mem) / 10 + 1;
  }

  for (i = 0; i < cache->db_cnt; ++i) {
    printf("sequence database (%d):: %d %d\n", i, cache->db[i].count, db_inx[i]);
  }

  printf("\nLoaded sequence db file %s; total memory %" PRId64 "\n", seqfile, total_mem);

  esl_sqfile_Close(sqfp);
  esl_sq_Destroy(sq);

  *ret_cache = cache;

  return eslOK;

 ERROR:
  if (sq    != NULL) esl_sq_Destroy(sq);
  if (abc   != NULL) esl_alphabet_Destroy(abc);
  if (cache != NULL) {
    if (cache->header_mem  != NULL) free(cache->header_mem);
    if (cache->residue_mem != NULL) free(cache->residue_mem);
    if (cache->name        != NULL) free(cache->name);
    if (cache->id          != NULL) free(cache->id);
    free(cache);
  }
  for (i = 0; i < db_cnt; ++i) {
    if (db[i].list != NULL) free(db[i].list);
  }
  return eslEMEM;
}
Beispiel #11
0
/* Create an SSI index file for open sequence file <sqfp>.
 * Both name and accession of sequences are stored as keys.
 */
static void
create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp)
{
  ESL_NEWSSI *ns      = NULL;
  ESL_SQ     *sq      = esl_sq_Create();
  int         nseq    = 0;
  char       *ssifile = NULL;
  uint16_t    fh;
  int         status;

  esl_strdup(sqfp->filename, -1, &ssifile);
  esl_strcat(&ssifile, -1, ".ssi", 4);
  status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */
  if      (status == eslENOTFOUND)   esl_fatal("failed to open SSI index %s", ssifile);
  else if (status == eslEOVERWRITE)  esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */
  else if (status != eslOK)          esl_fatal("failed to create a new SSI index");

  if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK)
    esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename);

  printf("Creating SSI index for %s...    ", sqfp->filename); 
  fflush(stdout);
  
  while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
    {
      nseq++;
      if (sq->name == NULL) esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq);

      if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK)
	esl_fatal("Failed to add key %s to SSI index", sq->name);

      if (sq->acc[0] != '\0') {
	if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK)
	  esl_fatal("Failed to add secondary key %s to SSI index", sq->acc);
      }
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
					   sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					    status, sqfp->filename);

  /* Determine if the file was suitable for fast subseq lookup. */
  if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0) {
    if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK) 
      esl_fatal("Failed to set %s for fast subseq lookup.");
  }

  /* Save the SSI file to disk */
  if (esl_newssi_Write(ns) != eslOK)  esl_fatal("Failed to write keys to ssi file %s\n", ssifile);

  /* Done - output and exit. */
  printf("done.\n");
  if (ns->nsecondary > 0) 
    printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary);
  else 
    printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary);
  printf("SSI index written to file %s\n", ssifile);

  free(ssifile);
  esl_sq_Destroy(sq);
  esl_newssi_Close(ns);
  return;
}