/* Function: P7PriorifyHMM()
 *
 * Purpose:  Combine the observed counts held in an HMM with Dirichlet
 *           prior pseudocounts, then renormalize the model.
 *
 * Args:     hmm -- the HMM to add counts to (counts form)
 *           pri -- the Dirichlet prior to use
 *
 * Return:   (void)
 *           HMM returns in probability form.
 */
void
P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
{
  int   node;   /* model position currently being processed       */
  float denom;  /* shared denominator for the B->D1 / B->M1 pair  */

  /* The model-dependent begin/end transitions get a plain Laplace
   * (+1) treatment rather than a Dirichlet prior.
   */
  FSet(hmm->begin + 2, hmm->M - 1, 0.);   /* zero begin[2..M]   */
  FSet(hmm->end + 1,   hmm->M - 1, 0.);   /* zero end[1..M-1]   */

  denom         = hmm->tbd1 + hmm->begin[1] + 2.;
  hmm->tbd1     = (hmm->tbd1 + 1.) / denom;
  hmm->begin[1] = (hmm->begin[1] + 1.) / denom;
  hmm->end[hmm->M] = 1.0;

  /* Nodes 1..M-1: transitions, match emissions and insert emissions. */
  node = 1;
  while (node < hmm->M)
    {
      P7PriorifyTransitionVector(hmm->t[node], pri);
      P7PriorifyEmissionVector(hmm->mat[node], pri,
                               pri->mnum, pri->mq, pri->m, NULL);
      P7PriorifyEmissionVector(hmm->ins[node], pri,
                               pri->inum, pri->iq, pri->i, NULL);
      node++;
    }

  /* Node M only has its match emission vector processed here. */
  P7PriorifyEmissionVector(hmm->mat[hmm->M], pri,
                           pri->mnum, pri->mq, pri->m, NULL);

  Plan7Renormalize(hmm);
}
/* Function: Plan9toPlan7()
 *
 * Purpose:  Build a Plan7 HMM from an old Plan9-style HMM and
 *           configure it in ls mode.
 *
 * Args:     hmm       - old plan9 style HMM (read only here)
 *           ret_plan7 - RETURN: the newly built Plan7 HMM
 *
 * Return:   (void)
 *           Plan7 HMM is allocated here. Free w/ FreePlan7().
 */
void
Plan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7)
{
  struct plan7_s *p7;   /* the model under construction  */
  int node;             /* model position                */
  int sym;              /* alphabet symbol index         */

  p7 = AllocPlan7(hmm->M);

  /* Nodes 1..M-1 carry both state transitions and insert emissions.
   * Only the seven transitions listed here are copied.
   */
  for (node = 1; node < hmm->M; node++)
    {
      p7->t[node][TMM] = hmm->mat[node].t[MATCH];
      p7->t[node][TMD] = hmm->mat[node].t[DELETE];
      p7->t[node][TMI] = hmm->mat[node].t[INSERT];
      p7->t[node][TDM] = hmm->del[node].t[MATCH];
      p7->t[node][TDD] = hmm->del[node].t[DELETE];
      p7->t[node][TIM] = hmm->ins[node].t[MATCH];
      p7->t[node][TII] = hmm->ins[node].t[INSERT];

      for (sym = 0; sym < Alphabet_size; sym++)
        p7->ins[node][sym] = hmm->ins[node].p[sym];
    }

  /* Match emissions exist for every node 1..M. */
  for (node = 1; node <= hmm->M; node++)
    {
      for (sym = 0; sym < Alphabet_size; sym++)
        p7->mat[node][sym] = hmm->mat[node].p[sym];
    }

  /* B->D1 is the node-0 delete share of (delete + match). */
  p7->tbd1 = hmm->mat[0].t[DELETE] /
             (hmm->mat[0].t[DELETE] + hmm->mat[0].t[MATCH]);

  /* We have to make up the null transition p1; use the default.
   * The null emissions set by that call are then overwritten with
   * the ones stored in the old model.
   */
  P7DefaultNullModel(p7->null, &(p7->p1));
  for (sym = 0; sym < Alphabet_size; sym++)
    p7->null[sym] = hmm->null[sym];

  /* Optional annotation: name, reference line, consensus structure. */
  if (hmm->name != NULL)
    Plan7SetName(p7, hmm->name);

  if (hmm->flags & HMM_REF)
    {
      strcpy(p7->rf, hmm->ref);
      p7->flags |= PLAN7_RF;
    }

  if (hmm->flags & HMM_CS)
    {
      strcpy(p7->cs, hmm->cs);
      p7->flags |= PLAN7_CS;
    }

  Plan7LSConfig(p7);             /* configure specials for ls-style alignment */
  Plan7Renormalize(p7);          /* mainly to correct for missing ID and DI   */
  p7->flags |= PLAN7_HASPROB;    /* probabilities are valid                   */
  p7->flags &= ~PLAN7_HASBITS;   /* scores are not valid                      */

  *ret_plan7 = p7;
}
/* Function: P7PriorifyHMM()
 *
 * Purpose:  Combine the observed counts held in an HMM with Dirichlet
 *           prior pseudocounts, then renormalize the model.
 *
 * Args:     hmm -- the HMM to add counts to (counts form)
 *           pri -- the Dirichlet prior to use
 *
 * Return:   (void)
 *           HMM returns in probability form.
 */
void
P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
{
  int   node;                /* model position currently being processed      */
  float denom;               /* shared denominator for the B->D1 / B->M1 pair */
  float t_mix[MAXDCHLET];    /* mixture distribution used for transitions     */
  float m_mix[MAXDCHLET];    /* mixture distribution used for match emissions */
  float i_mix[MAXDCHLET];    /* mixture distribution used for insert emissions*/

  /* The model-dependent begin/end transitions get a plain Laplace
   * (+1) treatment rather than a Dirichlet prior.
   */
  FSet(hmm->begin + 2, hmm->M - 1, 0.);   /* zero begin[2..M]   */
  FSet(hmm->end + 1,   hmm->M - 1, 0.);   /* zero end[1..M-1]   */

  denom         = hmm->tbd1 + hmm->begin[1] + 2.;
  hmm->tbd1     = (hmm->tbd1 + 1.) / denom;
  hmm->begin[1] = (hmm->begin[1] + 1.) / denom;
  hmm->end[hmm->M] = 1.0;

  /* Main model transitions and emissions, nodes 1..M-1. */
  node = 1;
  while (node < hmm->M)
    {
      /* Experimental code (collaboration with Michael Asman,
       * Erik Sonnhammer, CGR Stockholm).
       *
       * When X-PR* annotation is present for this node, the prior's
       * own mixture distribution is replaced by one that puts all its
       * weight on the single annotated Dirichlet component.
       * Without annotation (a NULL array or a negative entry) the
       * prior's real distribution is copied unchanged, so the
       * override has no effect.
       */
      if (hmm->tpri == NULL || hmm->tpri[node] < 0)
        FCopy(t_mix, pri->tq, pri->tnum);
      else
        {
          if (hmm->tpri[node] >= pri->tnum)
            Die("X-PRT annotation out of range");
          FSet(t_mix, pri->tnum, 0.0);
          t_mix[hmm->tpri[node]] = 1.0;
        }

      if (hmm->mpri == NULL || hmm->mpri[node] < 0)
        FCopy(m_mix, pri->mq, pri->mnum);
      else
        {
          if (hmm->mpri[node] >= pri->mnum)
            Die("X-PRM annotation out of range");
          FSet(m_mix, pri->mnum, 0.0);
          m_mix[hmm->mpri[node]] = 1.0;
        }

      if (hmm->ipri == NULL || hmm->ipri[node] < 0)
        FCopy(i_mix, pri->iq, pri->inum);
      else
        {
          if (hmm->ipri[node] >= pri->inum)
            Die("X-PRI annotation out of range");
          FSet(i_mix, pri->inum, 0.0);
          i_mix[hmm->ipri[node]] = 1.0;
        }

      /* The actual work for this node. */
      P7PriorifyTransitionVector(hmm->t[node], pri, t_mix);
      P7PriorifyEmissionVector(hmm->mat[node], pri,
                               pri->mnum, m_mix, pri->m, NULL);
      P7PriorifyEmissionVector(hmm->ins[node], pri,
                               pri->inum, i_mix, pri->i, NULL);
      node++;
    }

  /* The final match state M gets the same match-emission handling;
   * no transition or insert vector is processed for it here.
   */
  if (hmm->mpri == NULL || hmm->mpri[hmm->M] < 0)
    FCopy(m_mix, pri->mq, pri->mnum);
  else
    {
      if (hmm->mpri[hmm->M] >= pri->mnum)
        Die("X-PRM annotation out of range");
      FSet(m_mix, pri->mnum, 0.0);
      m_mix[hmm->mpri[hmm->M]] = 1.0;
    }
  P7PriorifyEmissionVector(hmm->mat[hmm->M], pri,
                           pri->mnum, m_mix, pri->m, NULL);

  /* Done adding pseudocounts: convert the counts to probabilities. */
  Plan7Renormalize(hmm);
}
int main(int argc, char **argv) { const char *hmmfile; /* file to read HMMs from */ FILE *fp; /* output file handle */ HMMFILE *hmmfp; /* opened hmmfile for reading */ struct plan7_s *hmm; /* HMM to generate from */ int L; /* length of a sequence */ int i; /* counter over sequences */ char *ofile; /* output sequence file */ int nseq; /* number of seqs to sample */ int seed; /* random number generator seed */ int be_quiet; /* TRUE to silence header/footer */ int do_alignment; /* TRUE to output in aligned format */ int do_consensus; /* TRUE to do a single consensus seq */ AjBool ajselex; AjBool ajcons; AjPFile inf=NULL; AjPFile outf=NULL; AjPStr instr=NULL; AjPStr outstr=NULL; #ifdef MEMDEBUG unsigned long histid1, histid2, orig_size, current_size; orig_size = malloc_inuse(&histid1); fprintf(stderr, "[... memory debugging is ON ...]\n"); #endif /*********************************************** * Parse command line ***********************************************/ nseq = 10; be_quiet = FALSE; do_alignment = FALSE; do_consensus = FALSE; ofile = NULL; embInitPV("ohmmemit",argc,argv,"HMMER",VERSION); ajselex = ajAcdGetBoolean("selex"); ajcons = ajAcdGetBoolean("consensus"); nseq = ajAcdGetInt("number"); seed = ajAcdGetInt("seed"); inf = ajAcdGetInfile("infile"); outf = ajAcdGetOutfile("outfile"); if(!seed) seed = time ((time_t *) NULL); if(ajselex) do_alignment=TRUE; else do_alignment=FALSE; if(ajcons) do_consensus=TRUE; else do_consensus=FALSE; instr = ajStrNewC((char *)ajFileGetNameC(inf)); outstr = ajStrNewC((char *)ajFileGetNameC(outf)); hmmfile = ajStrGetPtr(instr); sre_srandom(seed); if (do_alignment && do_consensus) ajFatal("Sorry, -selex and -consensus are incompatible.\n"); if (nseq != 10 && do_consensus) ajWarn("-consensus overrides -number (# of sampled seqs)"); /*********************************************** * Open HMM file (might be in HMMERDB or current directory). * Read a single HMM from it. 
***********************************************/ if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) ajFatal("Failed to open HMM file %s\n", hmmfile); if (!HMMFileRead(hmmfp, &hmm)) ajFatal("Failed to read any HMMs from %s\n", hmmfile); HMMFileClose(hmmfp); if (hmm == NULL) ajFatal("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); /* Configure the HMM to shut off N,J,C emission: so we * do a simple single pass through the model. */ Plan7NakedConfig(hmm); Plan7Renormalize(hmm); /*********************************************** * Open the output file, or stdout ***********************************************/ fp = ajFileGetFileptr(outf); /*********************************************** * Show the options banner ***********************************************/ be_quiet=TRUE; if (! be_quiet) { printf("HMM file: %s\n", hmmfile); if (! do_consensus) { printf("Number of seqs: %d\n", nseq); printf("Random seed: %d\n", seed); } printf("- - - - - - - - - - - - - - - - - - - - - - - - - " "- - - - - - -\n\n"); } /*********************************************** * Do the work. * If we're generating an alignment, we have to collect * all our traces, then output. If we're generating unaligned * sequences, we can emit one at a time. 
***********************************************/ if (do_consensus) { char *seq; SQINFO sqinfo; /* info about sequence (name/desc) */ EmitConsensusSequence(hmm, &seq, NULL, &L, NULL); strcpy(sqinfo.name, "consensus"); sqinfo.len = L; sqinfo.flags = SQINFO_NAME | SQINFO_LEN; WriteSeq(fp, kPearson, seq, &sqinfo); free(seq); } else if (do_alignment) { struct p7trace_s **tr; char **dsq; SQINFO *sqinfo; char **aseq; AINFO ainfo; float *wgt; dsq = MallocOrDie(sizeof(char *) * nseq); tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq); sqinfo = MallocOrDie(sizeof(SQINFO) * nseq); wgt = MallocOrDie(sizeof(float) * nseq); FSet(wgt, nseq, 1.0); for (i = 0; i < nseq; i++) { EmitSequence(hmm, &(dsq[i]), &L, &(tr[i])); sprintf(sqinfo[i].name, "seq%d", i+1); sqinfo[i].len = L; sqinfo[i].flags = SQINFO_NAME | SQINFO_LEN; } P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE, &aseq, &ainfo); /* Output the alignment */ WriteSELEX(fp, aseq, &ainfo, 50); if (ofile != NULL && !be_quiet) printf("Alignment saved in file %s\n", ofile); /* Free memory */ for (i = 0; i < nseq; i++) { P7FreeTrace(tr[i]); free(dsq[i]); } FreeAlignment(aseq, &ainfo); free(sqinfo); free(dsq); free(wgt); free(tr); } else /* unaligned sequence output */ { struct p7trace_s *tr; char *dsq; char *seq; SQINFO sqinfo; for (i = 0; i < nseq; i++) { EmitSequence(hmm, &dsq, &L, &tr); sprintf(sqinfo.name, "seq%d", i+1); sqinfo.len = L; sqinfo.flags = SQINFO_NAME | SQINFO_LEN; seq = DedigitizeSequence(dsq, L); WriteSeq(fp, kPearson, seq, &sqinfo); P7FreeTrace(tr); free(dsq); free(seq); } } ajFileClose(&outf); FreePlan7(hmm); SqdClean(); #ifdef MEMDEBUG current_size = malloc_inuse(&histid2); if (current_size != orig_size) malloc_list(2, histid1, histid2); else fprintf(stderr, "[No memory leaks.]\n"); #endif ajStrDel(&instr); ajStrDel(&outstr); ajFileClose(&inf); ajFileClose(&outf); embExit(); return 0; }