/*
 * Repeats Baum-Welch training `nrounds` times and keeps the best round.
 *
 * Every round re-randomises the model (model2modelR), trains it
 * (baum_welch_train), then runs viterbi() and posterior().  Entries
 * 1..L-1 of H->VT are stored for that round so that the rounds can be
 * compared by analyze_stability().  The model, Viterbi path and posterior
 * are dumped for the first round and for every later round whose training
 * score is strictly greater than the best seen so far.
 *
 * `nrounds` and `L` are brought into scope by UNPACK_HMM(H).
 *
 * Returns the best training score, which is also written to H->P.
 */
double baum_welch_Ntrain (Hmm *H)
{
  int **vt_runs;
  int trial, site;
  double trial_score, top_score;
  UNPACK_HMM(H);

  vt_runs = declare_int (nrounds, L+1);

  trial = 0;
  while (trial < nrounds)
    {
      model2modelR (H);
      trial_score = baum_welch_train (H, trial+1);
      viterbi (H);
      posterior (H);

      /* remember this round's Viterbi path */
      for (site = 1; site < L; site++)
        {
          vt_runs[trial][site] = H->VT[site];
        }

      /* first round, or a strictly better score: dump and record it */
      if (trial == 0 || trial_score > top_score)
        {
          dump_model (H);
          dump_viterbi (H);
          dump_posterior (H);
          top_score = trial_score;
        }

      trial++;
    }

  fprintf (stderr, "---- SUMMARY: Best: %10.3f STABILITY: %.3f\n",
           top_score, analyze_stability (vt_runs, nrounds, L, 100));

  H->P = top_score;
  return H->P;
}
// Sums, over every pair `p` yielded by ds.assignments(arguments()), the value
// posterior(p.first).log(p.first) multiplied by p.second, and returns the total.
// NOTE(review): as written the declaration names two return types
// (`enable_if<...>::type` followed by `real_type`) and no `template <...>`
// header introducing `Dataset` is visible here -- presumably lost when this
// snippet was extracted; confirm against the original header.
typename std::enable_if<is_dataset<Dataset>::value, real_type>::type real_type conditional_log(const Dataset& ds) const { real_type result(0); for (const auto& p : ds.assignments(arguments())) { result += posterior(p.first).log(p.first) * p.second; } return result; }
/**
 * Walks the observation and label ranges in lock-step (via zip) and returns
 * the sum of posterior(observation).log(label) over all pairs.
 * An empty pairing yields 0.
 */
real_type log(iterator_range<ObsIt> observations,
              iterator_range<LabIt>& labels) const {
  real_type total = 0;
  for (const auto& example : zip(observations, labels)) {
    const auto& label = example.second;
    total += posterior(example.first).log(label);
  }
  return total;
}
/**
 * Returns the fraction of (observation, label) pairs for which
 * posterior(observation).arg_max() compares equal to the label.
 *
 * The denominator is the number of pairs visited; no special handling is
 * done when the pairing is empty, so the result is then 0 / 0 in real_type.
 */
real_type accuracy(iterator_range<ObsIt> observations,
                   iterator_range<LabIt>& labels) const {
  real_type hits(0);
  real_type seen(0);
  for (const auto& example : zip(observations, labels)) {
    // the comparison result is added directly: a match adds one, a miss zero
    hits += posterior(example.first).arg_max() == example.second;
    seen += real_type(1);
  }
  return hits / seen;
}
///////////////////////////////////////////////////////////////////////////// // // // // // Decoding // // // // // ///////////////////////////////////////////////////////////////////////////// double decode (Hmm *H) { posterior (H); viterbi (H); dump_model(H); dump_viterbi(H); dump_posterior(H); return H->PP; }
int main(int argc, char ** argv) { MPI_Init(&argc, &argv); QUESO::FullEnvironment env(MPI_COMM_WORLD, "", "", NULL); QUESO::VectorSpace<QUESO::GslVector, QUESO::GslMatrix> paramSpace(env, "space_", 1, NULL); QUESO::GslVector minBound(paramSpace.zeroVector()); minBound[0] = -10.0; QUESO::GslVector maxBound(paramSpace.zeroVector()); maxBound[0] = 10.0; QUESO::BoxSubset<QUESO::GslVector, QUESO::GslMatrix> domain("", paramSpace, minBound, maxBound); QUESO::UniformVectorRV<QUESO::GslVector, QUESO::GslMatrix> prior("", domain); Likelihood<QUESO::GslVector, QUESO::GslMatrix> likelihood("", domain); QUESO::GenericVectorRV<QUESO::GslVector, QUESO::GslMatrix> posterior("", domain); QUESO::StatisticalInverseProblem<QUESO::GslVector, QUESO::GslMatrix> ip("", NULL, prior, likelihood, posterior); QUESO::GslVector initialValues(paramSpace.zeroVector()); initialValues[0] = 9.0; QUESO::GslMatrix proposalCovarianceMatrix(paramSpace.zeroVector()); proposalCovarianceMatrix(0, 0) = 1.0; ip.seedWithMAPEstimator(); ip.solveWithBayesMetropolisHastings(NULL, initialValues, &proposalCovarianceMatrix); // The first sample should be the seed QUESO::GslVector first_sample(paramSpace.zeroVector()); posterior.realizer().realization(first_sample); // Looser tolerance for the derivative calculated by using a finite // difference if (std::abs(first_sample[0]) > 1e-5) { std::cerr << "seedWithMAPEstimator failed. Seed was: " << first_sample[0] << std::endl; std::cerr << "Actual seed should be 0.0" << std::endl; queso_error(); } return 0; }
void _Estep(vector<double> &expected, vector<double> &r1vec, const vector<double> &prior, const vector<int> &r, const IntegerMatrix &data, const NumericMatrix &itemtrace, const int &ncores) { const int nquad = prior.size(); const int nitems = data.ncol(); const int npat = r.size(); #ifdef SUPPORT_OPENMP if(nquad * nitems > 1000){ omp_set_num_threads(ncores); } else { omp_set_num_threads(1); } #endif #pragma omp parallel for for (int pat = 0; pat < npat; ++pat){ vector<double> posterior(nquad,1.0); for(int q = 0; q < nquad; ++q) posterior[q] = posterior[q] * prior[q]; for (int item = 0; item < nitems; ++item) if(data(pat,item)) for(int q = 0; q < nquad; ++q) posterior[q] *= itemtrace(q,item); double expd = 0.0; for(int i = 0; i < nquad; ++i) expd += posterior[i]; expected[pat] = expd; for(int q = 0; q < nquad; ++q) posterior[q] = r[pat] * posterior[q] / expd; #pragma omp critical for (int item = 0; item < nitems; ++item) if (data(pat,item)) for(int q = 0; q < nquad; ++q) r1vec[q + item*nquad] += posterior[q]; } //end main }
void _Estepbfactor(vector<double> &expected, vector<double> &r1, vector<double> &ri, const NumericMatrix &itemtrace, const vector<double> &prior, const vector<double> &Priorbetween, const vector<int> &r, const int &ncores, const IntegerMatrix &data, const IntegerMatrix &sitems, const vector<double> &Prior) { #ifdef SUPPORT_OPENMP omp_set_num_threads(ncores); #endif const int sfact = sitems.ncol(); const int nitems = data.ncol(); const int npquad = prior.size(); const int nbquad = Priorbetween.size(); const int nquad = nbquad * npquad; const int npat = r.size(); vector<double> r1vec(nquad*nitems*sfact, 0.0); #pragma omp parallel for for (int pat = 0; pat < npat; ++pat){ vector<double> L(nquad), Elk(nbquad*sfact), posterior(nquad*sfact); vector<double> likelihoods(nquad*sfact, 1.0); for (int fact = 0; fact < sfact; ++fact){ for (int item = 0; item < nitems; ++item){ if (data(pat,item) && sitems(item,fact)) for (int k = 0; k < nquad; ++k) likelihoods[k + nquad*fact] = likelihoods[k + nquad*fact] * itemtrace(k,item); } } vector<double> Plk(nbquad*sfact); for (int fact = 0; fact < sfact; ++fact){ int k = 0; for (int q = 0; q < npquad; ++q){ for (int i = 0; i < nbquad; ++i){ L[k] = likelihoods[k + nquad*fact] * prior[q]; ++k; } } vector<double> tempsum(nbquad, 0.0); for (int i = 0; i < npquad; ++i) for (int q = 0; q < nbquad; ++q) tempsum[q] += L[q + i*nbquad]; for (int i = 0; i < nbquad; ++i) Plk[i + fact*nbquad] = tempsum[i]; } vector<double> Pls(nbquad, 1.0); for (int i = 0; i < nbquad; ++i){ for(int fact = 0; fact < sfact; ++fact) Pls[i] = Pls[i] * Plk[i + fact*nbquad]; expected[pat] += Pls[i] * Priorbetween[i]; } for (int fact = 0; fact < sfact; ++fact) for (int i = 0; i < nbquad; ++i) Elk[i + fact*nbquad] = Pls[i] / Plk[i + fact*nbquad]; for (int fact = 0; fact < sfact; ++fact) for (int i = 0; i < nquad; ++i) posterior[i + nquad*fact] = likelihoods[i + nquad*fact] * r[pat] * Elk[i % nbquad + fact*nbquad] / expected[pat]; #pragma omp critical for (int i = 0; i < 
nbquad; ++i) ri[i] += Pls[i] * r[pat] * Priorbetween[i] / expected[pat]; for (int item = 0; item < nitems; ++item) if (data(pat,item)) for (int fact = 0; fact < sfact; ++fact) for(int q = 0; q < nquad; ++q) r1vec[q + fact*nquad*nitems + nquad*item] += posterior[q + fact*nquad]; } //end main for (int item = 0; item < nitems; ++item) for (int fact = 0; fact < sfact; ++fact) if(sitems(item, fact)) for(int q = 0; q < nquad; ++q) r1[q + nquad*item] = r1vec[q + nquad*item + nquad*nitems*fact] * Prior[q]; }
// FlatBOFstBuilder::convertRNN -- builds `fst` from `rnnlm` by expanding
// history states taken from a FIFO queue `q`.
//
// Setup: the initial history is taken from the RNN hidden layer (through
// `dzer`) with last word 0, pushed on the queue and registered as a state;
// INIT_STATE is made the start state and one more state is added and marked
// final (FINAL_STATE).  A "minimum backoff" history is derived with
// getBackoff() from a history whose last word is -1 and stored in
// set_min_backoff.
//
// Main loop, for every history popped from the queue:
//   * its state id is looked up through h2state and a copy of the history is
//     appended to state2h; FINAL_STATE is skipped;
//   * computeEntropyAndConditionals() fills `entropy` and `all_prob`, and
//     getBackoff() yields the backoff history (replaced by min_backoff when
//     it equals the current history);
//   * for each word w: p = all_prob[w], p_joint = exp(-p),
//     delta = -p_joint * log2(p_joint).  The word is kept when the current
//     history is in set_min_backoff or delta > pruning_threshold * entropy;
//     otherwise `backoff` is set;
//   * if `backoff` is set, a state for the backoff history is added when new
//     and an EPSILON/EPSILON arc with weight LogWeight::Zero() is added to it
//     (the weights are handled later by computeAllBackoff());
//   * each kept word gets an arc of weight p: word 0 goes to FINAL_STATE, any
//     other word goes to the state of new_fsth with that last word, which is
//     created and queued when it does not exist yet.
//
// Afterwards: compactBackoffNodes(), computeAllBackoff(), removeStates() into
// new_fst which then replaces fst, and a symbol table ("*" = 0, word i = i+1)
// is attached as input and output symbols.
//
// NOTE(review): this text contains `//` line comments inside very long
// physical lines; presumably each comment originally ended at its own line
// break -- confirm against upstream formatting before compiling.
// NOTE(review): `catch (exception e)` catches by value, and the
// `non_bo_pred.at(new_id) = true;` following the if/else sits outside any
// try block -- looks like it can throw when the state already existed and
// new_id is beyond the vector's size; confirm.
// NOTE(review): the progress fprintf divides by n_processed and n_added,
// which are both still 0 the first time it runs -- confirm this is intended.
/** * Create an FST based on an RNN */ void FlatBOFstBuilder::convertRNN(CRnnLM & rnnlm, VectorFst<LogArc> &fst) { queue<NeuronFstHistory> q; VectorFst<LogArc> new_fst; NeuronFstHistory fsth(rnnlm.getHiddenLayerSize(),getNumBins()); FstIndex id = 0; NeuronFstHistory new_fsth(rnnlm.getHiddenLayerSize(),getNumBins()); FstIndex new_id; NeuronFstHistory min_backoff(rnnlm.getHiddenLayerSize(),getNumBins()); set<NeuronFstHistory>set_min_backoff; NeuronFstHistory bo_fsth(rnnlm.getHiddenLayerSize(),getNumBins()); bool backoff = false; vector<FstIndex> deleted; real p = 0.00; real p_joint = 0.00; real entropy = 0.0; real delta = 0.0; vector<real> all_prob(rnnlm.getVocabSize()); vector<real> posterior(10); map< FstIndex,set<FstIndex> > pred; vector<bool> non_bo_pred(rnnlm.getVocabSize()); vector<int> to_be_added; vector<int> to_be_removed; for (int i = 0; i < rnnlm.getVocabSize(); i++) { to_be_removed.push_back(i); } vector<real> to_be_added_prob; FstIndex n_added = 0; FstIndex n_processed = 0; FstIndex next_n_added = 0; FstIndex next_n_processed = 0; FstIndex n_backoff = 0; FstIndex n_only_backoff = 0; int v = rnnlm.getVocabSize(); int w = 0; // Initialize rnnlm.copyHiddenLayerToInput(); // printNeurons(rnnlm.getInputLayer(),0,10); // Initial state ( 0 | hidden layer after </s>) printNeurons(rnnlm.getHiddenLayer(),0,10); fsth.setFstHistory(rnnlm, *dzer); fsth.setLastWord(0); q.push(fsth); addFstState(id, new NeuronFstHistory(fsth), fst); fst.SetStart(INIT_STATE); // Final state (don't care about the associated discrete representation) fst.AddState(); fst.SetFinal(FINAL_STATE, LogWeight::One()); /*posterior.at(INIT_STATE) = MY_LOG_ONE;*/ min_backoff.setLastWord(-1); computeEntropyAndConditionals(entropy, all_prob, rnnlm, min_backoff); min_backoff = getBackoff(rnnlm, min_backoff, set_min_backoff, all_prob, to_be_removed); cout << "MIN BACKOFF " << min_backoff.toString() << endl; set_min_backoff.insert(min_backoff); // addFstState(id, min_backoff, fst); // q.push(min_backoff); 
// Estimate number of backoff loop to bound the backoff path length // float ratioa = 0.0; // float ratiob = 0.0; float ratio = 0.0; // for (int i=0; i < min_backoff.getNumDims(); i++) { // if (min_backoff.getDim(i) == 1) { // ratioa++; // } // if (fsth.getDim(i) == 1) { // ratiob++; // } // } // ratioa /= min_backoff.getNumDims(); // ratiob /= min_backoff.getNumDims(); // ratio = (ratioa*(1.0-ratiob))+(ratiob*(1.0-ratioa)); ratio=1.0; // printf("ratio=%f\t%i BO loops\n", ratio, n_bo_loops); //foreach state in the queue while (!q.empty()) { fsth = q.front(); q.pop(); id = h2state[&fsth]; state2h.push_back(new NeuronFstHistory(fsth)); if (id == FINAL_STATE) { continue; } dprintf(1,"-- STUDY STATE %li = %s\n", id, fsth.toString().c_str()); /* try { posterior.at(id) = MY_LOG_ONE; } catch (exception e) { posterior.resize((int) (posterior.size()*1.5)+1); posterior.at(id) = MY_LOG_ONE; }*/ computeEntropyAndConditionals(entropy, all_prob, rnnlm, fsth); //compute BO in advance and check if it is a min BO node bo_fsth = getBackoff(rnnlm, fsth, set_min_backoff, all_prob, to_be_removed); if (bo_fsth == fsth) { bo_fsth = min_backoff; } //foreach w (ie, foreach word of each class c) //test if the edge has to kept or removed backoff = false; //no backoff yet since no edge has been removed for (w=0; w < rnnlm.getVocabSize(); w++) { p = all_prob[w]; /*p_joint = exp(-posterior[id]-p);*/ p_joint = exp(-p); delta = -1.0*p_joint*log2(p_joint); //accept edge if this leads to a minimum //relative gain of the entropy dprintf(2,"P = %e \tP_joint = %e \tH = %e \tDelta =%e \tDelta H = %.6f %%\n",exp(-p), p_joint, entropy, delta, 100.0*delta/entropy); if (set_min_backoff.find(fsth) != set_min_backoff.end() || (delta > pruning_threshold*entropy)) { // if ((fsth == min_backoff) || (delta > pruning_threshold*entropy)) { next_n_added++; to_be_added.push_back(w); to_be_added_prob.push_back(p); dprintf(2,"\tACCEPT [%li] -- %i (%s) / %f --> ...\t(%e > %e)\n", id, w, rnnlm.getWordString(w), p, 
delta, pruning_threshold*entropy); // to_be_removed.push_back(w); } //backoff else { // to_be_removed.push_back(w); backoff = true; dprintf(2,"\tPRUNE [%li] -- %i / %f --> ...\n", id, w, p); } //print if (next_n_processed % 100000 == 0) { fprintf(stderr, "\rH=%.5f / N proc'd=%li / N added=%li (%.5f %%) / N bo=%li (%.5f %%) / %li/%li Nodes (%2.1f %%) / N min BO=%i", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, n_backoff, ((float) n_backoff/ (float)n_added)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size())), (int) set_min_backoff.size()); } next_n_processed++; // } } //Set a part of the new FST history new_fsth.setFstHistory(rnnlm, *dzer); //if at least one word is backing off if (backoff) { n_backoff++; if (to_be_added.size() == 0) { n_only_backoff++; } if (addFstState(new_id, new NeuronFstHistory(bo_fsth), fst)) { q.push(bo_fsth); try { non_bo_pred.at(new_id) = false; } catch (exception e) { non_bo_pred.resize(new_id+(int) (non_bo_pred.size()*0.5)+1); non_bo_pred.at(new_id) = false; } } dprintf(1,"BACKOFF\t[%li]\t(%s)\n-------\t[%li]\t(%s)\n", id, fsth.toString().c_str(), new_id, bo_fsth.toString().c_str()); fst.AddArc(id, LogArc(EPSILON, EPSILON, LogWeight::Zero(), new_id)); addPred(pred, new_id, id); } vector<real>::iterator it_p = to_be_added_prob.begin(); for (vector<int>::iterator it = to_be_added.begin(); it != to_be_added.end(); ++it) { w = *it; p = *it_p; if (w == 0) { fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,FINAL_STATE)); dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] FINAL STATE)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, FINAL_STATE); } //accept edge else { new_fsth.setLastWord(w); //if sw not in the memory //then add a new state for sw in the FST and push sw in the queue if (addFstState(new_id, new NeuronFstHistory(new_fsth), fst)) { q.push(new_fsth); try { non_bo_pred.at(new_id) = true; } catch (exception e) { non_bo_pred.resize(new_id+(int) 
(non_bo_pred.size()*0.5)+1); non_bo_pred.at(new_id) = true; } } else { /* already exists */ } //add the edge in the FST non_bo_pred.at(new_id) = true; fst.AddArc(id, LogArc(FstWord(w),FstWord(w),p,new_id)); dprintf(1,"EDGE [%li] (%s)\n---- %i (%s) / %f -->\n---- [%li] (%s)\n\n", id, fsth.toString().c_str(), FstWord(w), rnnlm.getWordString(w), p, new_id, new_fsth.toString().c_str()); // posterior.at(new_id) += posterior[id]*p; } /*if (posterior[id]+p < LogWeight::Zero().Value()) { p_joint = exp(-posterior[id]-p); entropy -= p_joint*log2(p_joint); }*/ ++it_p; } n_added = next_n_added; n_processed = next_n_processed; //reset queues to_be_added.clear(); to_be_added_prob.clear(); // to_be_removed.clear(); } cout << endl; //compute backoff weights deleted = compactBackoffNodes(fst, pred, non_bo_pred); computeAllBackoff(fst, pred); //remove useless nodes removeStates(fst, new_fst, deleted); fst.DeleteStates(); fst = new_fst; //Fill the table of symbols SymbolTable dic("dictionnary"); dic.AddSymbol("*", 0); for (int i=0; i<rnnlm.getVocabSize(); i++) { dic.AddSymbol(string(rnnlm.getWordString(i)), i+1); } fst.SetInputSymbols(&dic); fst.SetOutputSymbols(&dic); //printf("H=%.5f / N proc'd=%li / N added=%li (%.5f %%) %li/%li Nodes (%2.1f %%)\n", entropy, n_processed, n_added, ((float) n_added/ (float)n_processed)*100.0, id, id+q.size(), 100.0 - (float) (100.0*id/(id+q.size()))); cout << "END" << endl; }
match *find_best_match(const adapter_array *aa, const char *read, float *p_quals, float prior, float p_match, int min_l) { /* Take an adapter array, and check the read against all adapters. Brute force string matching is used. This is to avoid approximate matching algorithms which required an a priori specified number mismatches. */ match *best_match=NULL; int i, shift, max_shift, found_contam=0; int *best_arr=NULL, best_adapter=0, best_length=0, best_shift=0, best_score=INT_MIN; int al, curr_score, *curr_arr=NULL; int rl = strlen(read); posterior_set *ps=NULL; float *best_p_quals=NULL; max_shift = rl - min_l; for (shift = 0; shift < max_shift; shift++) { for (i = 0; i < aa->n; i++) { if (min_l >= aa->adapters[i].length) { fprintf(stderr, "Minimum match length (option -n) greater than or " \ "equal to length of adapter.\n"); exit(EXIT_FAILURE); } al = min(aa->adapters[i].length, strlen(&(read)[shift])); curr_arr = score_sequence(&(read)[shift], (aa->adapters[i]).seq, al); curr_score = sum(curr_arr, al); if (curr_score > best_score) { best_score = curr_score; best_length = al; best_shift = shift; best_p_quals = &(p_quals)[shift]; best_arr = curr_arr; best_adapter = i; ps = posterior(best_arr, best_p_quals, prior, 0.25, best_length); found_contam = ps->is_contam; if (found_contam) { break; } else { free(ps); ps=NULL; free(best_arr); } } else free(curr_arr); } if (found_contam) break; } if (!found_contam) /* no match found */ return NULL; /* save this match */ best_match = xmalloc(sizeof(match)); best_match->match = best_arr; best_match->shift = best_shift; best_match->length = best_length; best_match->ps = ps; best_match->score = best_score; best_match->adapter_index = best_adapter; best_match->p_quals = best_p_quals; best_match->match_pos = calloc(best_length, sizeof(int)); for (i = 0; i < best_length; i++) best_match->match_pos[i] = best_match->match[i] == MATCH_SCORE; return best_match; }