/* Step 2. Extract the training set and test set.
 *
 * Clusters <msa> by single linkage at cfg->idthresh1 and takes the
 * cluster selected by esl_vec_IArgMax() over the per-cluster counts
 * (i.e. the largest one) as the training alignment. All remaining
 * sequences are clustered again at cfg->idthresh2, and one randomly
 * chosen sequence from each of those clusters is pushed onto the test
 * stack.
 *
 * Returns eslOK on success. *ret_trainmsa is then the new training
 * alignment and *ret_teststack a new pointer stack of ESL_SQ test
 * sequences; the stack is empty if every sequence ended up in the
 * training alignment. The caller takes ownership of both.
 *
 * On failure, returns the status of the step that failed; everything
 * allocated here is released and both outputs are set to NULL.
 */
static int
separate_sets(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_trainmsa, ESL_STACK **ret_teststack)
{
  ESL_MSA   *trainmsa   = NULL;
  ESL_MSA   *test_msa   = NULL;
  ESL_STACK *teststack  = NULL;
  ESL_SQ    *sq         = NULL;
  int       *assignment = NULL;
  int       *nin        = NULL;
  int       *useme      = NULL;
  int        nc         = 0;
  int        c;
  int        ctrain;    /* index of the cluster that becomes the training alignment */
  int        nskip;
  int        i;
  int        status;

  if ((teststack = esl_stack_PCreate()) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);

  if ((status = esl_msacluster_SingleLinkage(msa, cfg->idthresh1, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  ctrain = esl_vec_IArgMax(nin, nc);

  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] == ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &trainmsa)) != eslOK) goto ERROR;

  /* If all the seqs went into the training msa, none are left for
   * testing, and the test stack is returned empty.
   */
  if (trainmsa->nseq != msa->nseq)
    {
      /* Put all the other sequences into an MSA of their own; from these, we'll
       * choose test sequences.
       */
      for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] != ctrain) ? 1 : 0;
      if ((status = esl_msa_SequenceSubset(msa, useme, &test_msa)) != eslOK) goto ERROR;

      /* Cluster those test sequences. The first clustering's arrays are
       * released and NULLed first so the ERROR path cannot double-free.
       */
      free(nin);         nin        = NULL;
      free(assignment);  assignment = NULL;
      if ((status = esl_msacluster_SingleLinkage(test_msa, cfg->idthresh2, &assignment, &nin, &nc)) != eslOK) goto ERROR;

      for (c = 0; c < nc; c++)
        {
          nskip = esl_rnd_Roll(cfg->r, nin[c]); /* pick a random seq in this cluster to be the test. */
          for (i = 0; i < test_msa->nseq; i++)
            {
              if (assignment[i] != c) continue;
              if (nskip > 0) { nskip--; continue; }

              if ((status = esl_sq_FetchFromMSA(test_msa, i, &sq)) != eslOK) goto ERROR;
              if ((status = esl_stack_PPush(teststack, (void *) sq)) != eslOK)
                {
                  /* sq never made it onto the stack, so the ERROR path
                   * would not find it; release it here.
                   */
                  esl_sq_Destroy(sq);
                  goto ERROR;
                }
              break;
            }
        }

      esl_msa_Destroy(test_msa);
    }

  free(useme);
  free(nin);
  free(assignment);

  *ret_trainmsa  = trainmsa;
  *ret_teststack = teststack;
  return eslOK;

 ERROR:
  free(useme);
  free(assignment);
  free(nin);
  esl_msa_Destroy(trainmsa);
  esl_msa_Destroy(test_msa);
  /* teststack is NULL if its creation was the step that failed. */
  if (teststack != NULL)
    {
      while (esl_stack_PPop(teststack, (void **) &sq) == eslOK) esl_sq_Destroy(sq);
      esl_stack_Destroy(teststack);
    }
  *ret_trainmsa  = NULL;
  *ret_teststack = NULL;
  return status;
}
/* map_alignment()
 *
 * Reads one alignment from <msafile>, verifies via checksum that it is
 * the alignment <hmm> was built from, and converts it into an array of
 * sequences plus an array of matching 'faux' traces, using the HMM's
 * map to decide which alignment columns are match columns.
 *
 * Open/read failures, a missing checksum or map on the HMM, and a
 * checksum mismatch are reported through the fatal-error handlers.
 *
 * Returns eslOK on success; *ret_sq and *ret_tr are then newly
 * allocated arrays of *ret_ntot sequences and traces, owned by the
 * caller.
 *
 * On any other failure, returns the failing step's status; everything
 * allocated here is released, *ret_sq and *ret_tr are NULL, and
 * *ret_ntot is 0.
 */
static int
map_alignment(const char *msafile, const P7_HMM *hmm, ESL_SQ ***ret_sq, P7_TRACE ***ret_tr, int *ret_ntot)
{
  ESL_SQ       **sq        = NULL;
  P7_TRACE     **tr        = NULL;
  ESLX_MSAFILE  *afp       = NULL;
  ESL_MSA       *msa       = NULL;
  ESL_ALPHABET  *abc       = (ESL_ALPHABET *) hmm->abc; /* removing const'ness to make compiler happy. Safe. */
  int           *matassign = NULL;
  uint32_t       chksum    = 0;
  int            nsq       = 0;   /* number of sq[] entries that hold a fetched sequence */
  int            ntr       = 0;   /* number of tr[] entries that hold a trace            */
  int            i, k;
  int            status;

  status = eslx_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
  if (status != eslOK) eslx_msafile_OpenFailure(afp, status);

  status = eslx_msafile_Read(afp, &msa);
  if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

  if (! (hmm->flags & p7H_CHKSUM) ) esl_fatal("HMM has no checksum. --mapali unreliable without it.");
  if (! (hmm->flags & p7H_MAP) )    esl_fatal("HMM has no map. --mapali can't work without it.");

  if ((status = esl_msa_Checksum(msa, &chksum)) != eslOK) goto ERROR;
  if (hmm->checksum != chksum) esl_fatal("--mapali MSA %s isn't same as the one HMM came from (checksum mismatch)", msafile);

  ESL_ALLOC(sq,        sizeof(ESL_SQ *)   * msa->nseq);
  ESL_ALLOC(tr,        sizeof(P7_TRACE *) * msa->nseq);
  ESL_ALLOC(matassign, sizeof(int)        * (msa->alen + 1));

  /* Flag the alignment columns that the HMM's map assigns to match states. */
  esl_vec_ISet(matassign, msa->alen+1, 0);
  for (k = 1; k <= hmm->M; k++) matassign[hmm->map[k]] = 1;

  /* ntr stays 0 if this fails, so the ERROR path leaves tr[] elements
   * alone; NOTE(review): this presumes p7_trace_FauxFromMSA() releases
   * its own partial output on failure -- confirm.
   */
  if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_DEFAULT, tr)) != eslOK) goto ERROR;
  ntr = msa->nseq;

  /* The 'faux' core traces constructed by FauxFromMSA() may contain
   * D->I and I->D transitions. They may *only* now be passed to
   * p7_tracealign_Seqs(), which can deal with these 'illegal'
   * transitions, in order to exactly reproduce the input --mapali
   * alignment.
   */

  for (i = 0; i < msa->nseq; i++)
    {
      if ((status = esl_sq_FetchFromMSA(msa, i, &(sq[i]))) != eslOK) goto ERROR;
      nsq++;
    }

  *ret_ntot = msa->nseq;
  *ret_tr   = tr;
  *ret_sq   = sq;

  eslx_msafile_Close(afp);
  esl_msa_Destroy(msa);
  free(matassign);
  return eslOK;

 ERROR:
  /* Only the entries counted by nsq/ntr are known to be valid. */
  for (i = 0; i < nsq; i++) esl_sq_Destroy(sq[i]);
  for (i = 0; i < ntr; i++) p7_trace_Destroy(tr[i]);
  free(sq);
  free(tr);
  free(matassign);
  if (afp != NULL) eslx_msafile_Close(afp);
  if (msa != NULL) esl_msa_Destroy(msa);
  *ret_ntot = 0;
  *ret_tr   = NULL;
  *ret_sq   = NULL;
  return status;
}