Esempio n. 1
0
/* Step 2. Extract the training set and test set.
 *
 * Clusters <msa> by single linkage at cfg->idthresh1 and copies the
 * cluster with the most members into a new training alignment. Any
 * remaining sequences are gathered into a temporary alignment,
 * clustered again at cfg->idthresh2, and one randomly chosen sequence
 * per cluster (drawn with cfg->r) is pushed onto the test stack.
 *
 * Returns eslOK on success; <*ret_trainmsa> is the training alignment
 * and <*ret_teststack> is a pointer stack of ESL_SQ test sequences,
 * which is empty when every sequence went into the training set.
 * The caller takes ownership of both.
 *
 * On failure, returns eslEMEM (allocation) or the status of the Easel
 * call that failed; everything allocated here is released and both
 * return pointers are set to NULL.
 */
static int
separate_sets(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_trainmsa, ESL_STACK **ret_teststack)
{
  ESL_MSA   *trainmsa   = NULL;
  ESL_MSA   *test_msa   = NULL;
  ESL_STACK *teststack  = NULL;
  ESL_SQ    *sq         = NULL;
  int       *assignment = NULL;   /* cluster index assigned to each sequence   */
  int       *nin        = NULL;   /* number of sequences in each cluster       */
  int       *useme      = NULL;   /* 1/0 selection flags for SequenceSubset()  */
  int        nc         = 0;      /* number of clusters                        */
  int        c;
  int        ctrain;              /* index of the cluster that becomes the training alignment */
  int        nskip;
  int        i;
  int        status;

  if ((teststack = esl_stack_PCreate()) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);

  if ((status = esl_msacluster_SingleLinkage(msa, cfg->idthresh1, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  ctrain = esl_vec_IArgMax(nin, nc);

  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] == ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &trainmsa)) != eslOK) goto ERROR;

  /* If all the seqs went into the training msa, none are left for
   * testing, and the test stack is returned empty.
   */
  if (trainmsa->nseq != msa->nseq)
    {
      /* Put all the other sequences into an MSA of their own; from these,
       * we'll choose test sequences.
       */
      for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] != ctrain) ? 1 : 0;
      if ((status = esl_msa_SequenceSubset(msa, useme, &test_msa)) != eslOK) goto ERROR;

      /* Cluster those test sequences. The first clustering's results are
       * released (and NULLed, so ERROR can't free them twice) before reuse.
       */
      free(nin);         nin        = NULL;
      free(assignment);  assignment = NULL;
      if ((status = esl_msacluster_SingleLinkage(test_msa, cfg->idthresh2, &assignment, &nin, &nc)) != eslOK) goto ERROR;

      for (c = 0; c < nc; c++)
        {
          nskip = esl_rnd_Roll(cfg->r, nin[c]); /* pick a random seq in this cluster to be the test. */
          for (i = 0; i < test_msa->nseq; i++)
            {
              if (assignment[i] != c) continue;
              if (nskip > 0) { nskip--; continue; }

              if ((status = esl_sq_FetchFromMSA(test_msa, i, &sq)) != eslOK) goto ERROR;
              if ((status = esl_stack_PPush(teststack, (void *) sq)) != eslOK)
                { /* the stack did not take ownership, so sq is still ours to free */
                  esl_sq_Destroy(sq);
                  goto ERROR;
                }
              sq = NULL;          /* now owned by the stack */
              break;
            }
        }

      esl_msa_Destroy(test_msa);
    }

  free(useme);
  free(nin);
  free(assignment);

  *ret_trainmsa  = trainmsa;
  *ret_teststack = teststack;
  return eslOK;

 ERROR:
  free(useme);
  free(assignment);
  free(nin);
  esl_msa_Destroy(trainmsa);
  esl_msa_Destroy(test_msa);
  if (teststack != NULL)   /* NULL when the stack itself could not be created */
    {
      while (esl_stack_PPop(teststack, (void **) &sq) == eslOK) esl_sq_Destroy(sq);
      esl_stack_Destroy(teststack);
    }
  *ret_trainmsa  = NULL;
  *ret_teststack = NULL;
  return status;
}
Esempio n. 2
0
/* map_alignment()
 *
 * Reads one alignment from <msafile>, confirms by checksum that it is
 * the alignment <hmm> was built from, and converts it into per-sequence
 * data: a digital sequence and a 'faux' core trace for each aligned
 * sequence, with match columns taken from the HMM's map.
 *
 * Returns eslOK on success; <*ret_sq> and <*ret_tr> are newly allocated
 * arrays of <*ret_ntot> sequences and traces, owned by the caller.
 *
 * File open/read problems and HMM/alignment mismatches are reported via
 * the fatal-error handlers (presumably these do not return). On any
 * other failure, returns the failing call's status (eslEMEM for
 * allocation failure), releases everything built so far, and sets
 * <*ret_sq>, <*ret_tr> to NULL and <*ret_ntot> to 0.
 */
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            nseq      = 0;   /* copy of msa->nseq, still usable for cleanup after msa is gone */
  int            i,k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM)  )  esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP)  )     esl_fatal("HMM has no map. --mapali can't work without it.");
  if ((status = esl_msa_Checksum(msa, &chksum)) != eslOK) goto ERROR;
  if (hmm->checksum != chksum)        esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  nseq = msa->nseq;

  /* Each pointer array is NULL-filled as soon as it exists, so the
   * ERROR path can tell which elements were actually created.
   */
  ESL_ALLOC(sq, sizeof(ESL_SQ *)   * nseq);
  for (i = 0; i < nseq; i++) sq[i] = NULL;
  ESL_ALLOC(tr, sizeof(P7_TRACE *) * nseq);
  for (i = 0; i < nseq; i++) tr[i] = NULL;
  ESL_ALLOC(matassign, sizeof(int) * (msa->alen + 1));

  /* Mark the alignment columns that the HMM's map assigns to match states. */
  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++) matassign[hmm->map[k]] = 1;

  if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr)) != eslOK) goto ERROR;

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions.  They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */

  for (i = 0; i < nseq; i++)
    if ((status = esl_sq_FetchFromMSA(msa, i, &(sq[i]))) != eslOK) goto ERROR;

  *ret_ntot = nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  if (sq != NULL)
    {
      for (i = 0; i < nseq; i++)
        if (sq[i] != NULL) esl_sq_Destroy(sq[i]);
      free(sq);
    }
  if (tr != NULL)
    {
      for (i = 0; i < nseq; i++)
        if (tr[i] != NULL) p7_trace_Destroy(tr[i]);
      free(tr);
    }
  if (afp       != NULL) eslx_msafile_Close(afp);
  if (msa       != NULL) esl_msa_Destroy(msa);
  free(matassign);
  return status;
}