word DecodingLayer::initTokensWord(string dictWord)
{
    word ret;
    ret.label = dictWord;
    int S = 2*dictWord.size() + 1;
    for(int i = 0; i < S+2; ++i) // S+2 because I need the special indices: -1 (output token) and 0 (input token)
    {
        vector<token> vecTok; // a row of the matrix -- for a specific segment _i_
        for(int j = 0; j < T; ++j)
        {
            token tok;
            tok.score = logZero; // init all with ln 0 = -inf
            vecTok.push_back(tok); // (-inf, empty-set)
        }
        ret.tok.push_back(vecTok); // add the row to the matrix
    }

    // tok(w, s=1, t=1)
    ret.tok[2][0].score = safe_log( y(0, alphabet[' ']) );
    ret.tok[2][0].history.push_back(dictWord);

    // tok(w, 2, 1)
    ret.tok[3][0].score = safe_log( y(0, alphabet[dictWord[0]]) );
    ret.tok[3][0].history.push_back(dictWord);

    if(dictWord.size() == 1) // init tok(w, -1, 1)
        ret.tok[1][0] = ret.tok[3][0];
    else
    {
        token tok;
        tok.score = logZero;
        ret.tok[1][0] = tok;
    }
    return ret;
}
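// The token and word types, logZero, and safe_log used above are not part of
// this snippet. A minimal sketch consistent with the usage (a log-domain
// score plus a word history, and an (S+2) x T token matrix per dictionary
// word) -- an assumption, not the original definitions -- might look like:
#include <cmath>
#include <limits>
#include <string>
#include <vector>

struct token {
    double score;                      // log-domain score
    std::vector<std::string> history;  // decoded word sequence so far
};

struct word {
    std::string label;                     // the dictionary word
    std::vector<std::vector<token> > tok;  // tok[segment][time]
};

const double logZero = -std::numeric_limits<double>::infinity();

// log that tolerates zero: ln(0) is represented as -inf
inline double safe_log(double x) {
    return x > 0.0 ? std::log(x) : logZero;
}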
// This is Equation 2 (transformed to log) from
// "A statistical framework for SNP calling ...", Heng Li, Bioinformatics, 2011
// http://bioinformatics.oxfordjournals.org/content/27/21/2987.full
double Caller::genotype_log_likelihood(const BasePileup& bp,
                                       const vector<pair<int, int> >& base_offsets,
                                       double g, char first, char second) {
    double m = 2.; // always assume two alleles
    double log_likelihood = log(0.25); // 1 / m^2, where m = ploidy = 2
    const string& bases = bp.bases();
    const string& quals = bp.qualities();
    double perr;
    for (size_t i = 0; i < base_offsets.size(); ++i) {
        char base = Pileups::extract_match(bp, base_offsets[i].first);
        char qual = base_offsets[i].second >= 0 ? quals[base_offsets[i].second]
                                                : _default_quality;
        perr = phred2prob(qual);
        if (base == first) {
            log_likelihood += safe_log((m - g) * perr + g * (1. - perr));
        } else if (base == second) {
            log_likelihood += safe_log((m - g) * (1. - perr) + g * perr);
        } else {
            log_likelihood += safe_log(perr * perr);
        }
    }
    return log_likelihood;
}
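// phred2prob is assumed to convert a stored quality into an error
// probability. A sketch under the standard convention p_err = 10^(-Q/10);
// whether the char already holds the numeric Phred score or still carries an
// ASCII offset depends on the pileup encoding, so this is an assumption:
#include <cmath>

inline double phred2prob(char qual) {
    return std::pow(10.0, -static_cast<double>(qual) / 10.0);
}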
/* LOG_GAMMA
   03nov94 wmt: move into separate file
   19dec94 wmt: return double rather than float */
double log_gamma( double x, int low_precision)
{
  if (x > 3.0) {
    if (low_precision == TRUE)
      return (((x - 0.5) * safe_log(x)) + (-1.0 * x) +
              0.9189385332046727 +                          /* log(sqrt(2*pi)) */
              (0.08333333333333333 / x) +                   /* (1/12) / x */
              (-1.0 * (0.002777777777777778 / (x*x*x))));   /* -(1/360) / x^3 */
    else
      return (((x - 0.5) * safe_log(x)) + (-1.0 * x) +
              0.9189385332046727 +                          /* log(sqrt(2*pi)) */
              (0.08333333333333333 / x) +                   /* (1/12) / x */
              (-1.0 * (0.002777777777777778 / (x*x*x))) +   /* -(1/360) / x^3 */
              (0.0007936507936507937 / (x*x*x*x*x)) +       /* (1/1260) / x^5 */
              (-1.0 * (0.0005952380952380953 / pow( x, 7)))); /* -(1/1680) / x^7 */
  }
  if ((x == 1.0) || (x == 2.0))
    return 0.0;
  if (x > 0.0)
    return (log_gamma( 3.0 + x, low_precision ) -
            safe_log((double) (x * (1.0 + x) * (2.0 + x))));
  fprintf( stderr, "Attempted to take log_gamma of %20.15f\n", x);
  return 0.0; /* not meaningful, but must return something */
}
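// A hypothetical sanity check (not from the source): for moderate x the
// Stirling series above should closely track the C library's lgamma().
#include <cmath>
#include <cstdio>

int main() {
    for (double x = 0.5; x <= 10.0; x += 0.5) {
        std::printf("x=%5.2f  log_gamma=%12.8f  lgamma=%12.8f\n",
                    x, log_gamma(x, 0 /* full precision */), std::lgamma(x));
    }
    return 0;
}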
void maximization(llna_model* model, llna_ss* ss)
{
    int i, j;
    double sum;

    // mean maximization
    for (i = 0; i < model->k-1; i++)
        vset(model->mu, i, vget(ss->mu_ss, i) / ss->ndata);

    // covariance maximization
    for (i = 0; i < model->k-1; i++)
    {
        for (j = 0; j < model->k-1; j++)
        {
            mset(model->cov, i, j,
                 (1.0 / ss->ndata) *
                 (mget(ss->cov_ss, i, j) +
                  ss->ndata * vget(model->mu, i) * vget(model->mu, j) -
                  vget(ss->mu_ss, i) * vget(model->mu, j) -
                  vget(ss->mu_ss, j) * vget(model->mu, i)));
        }
    }
    if (PARAMS.cov_estimate == SHRINK)
    {
        cov_shrinkage(model->cov, ss->ndata, model->cov);
    }
    matrix_inverse(model->cov, model->inv_cov);
    model->log_det_inv_cov = log_det(model->inv_cov);

    // topic maximization
    for (i = 0; i < model->k; i++)
    {
        sum = 0;
        for (j = 0; j < model->log_beta->size2; j++)
            sum += mget(ss->beta_ss, i, j);

        if (sum == 0)
            sum = safe_log(sum) * model->log_beta->size2;
        else
            sum = safe_log(sum);

        for (j = 0; j < model->log_beta->size2; j++)
            mset(model->log_beta, i, j, safe_log(mget(ss->beta_ss, i, j)) - sum);
    }
}
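// vget/vset/mget/mset are assumed to be thin wrappers over the GSL vector
// and matrix accessors (a common convention in this kind of code, but an
// assumption here):
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>

static inline double vget(const gsl_vector* v, int i)            { return gsl_vector_get(v, i); }
static inline void   vset(gsl_vector* v, int i, double x)        { gsl_vector_set(v, i, x); }
static inline double mget(const gsl_matrix* m, int i, int j)     { return gsl_matrix_get(m, i, j); }
static inline void   mset(gsl_matrix* m, int i, int j, double x) { gsl_matrix_set(m, i, j, x); }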
Caller::Caller(VG* graph,
               double het_prior,
               int min_depth,
               int max_depth,
               int min_support,
               double min_frac,
               double min_likelihood,
               bool leave_uncalled,
               int default_quality):
    _graph(graph),
    _het_log_prior(safe_log(het_prior)),
    _hom_log_prior(safe_log(.5 * (1. - het_prior))),
    _min_depth(min_depth),
    _max_depth(max_depth),
    _min_support(min_support),
    _min_frac(min_frac),
    _min_log_likelihood(safe_log(min_likelihood)),
    _leave_uncalled(leave_uncalled),
    _default_quality(default_quality) {
    _max_id = _graph->max_node_id();
}
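// Hypothetical usage sketch; the graph and every threshold below are
// illustrative values, not defaults from the source:
void example_caller_setup(VG* graph) {
    Caller caller(graph,
                  0.001,   // het_prior
                  10,      // min_depth
                  200,     // max_depth
                  3,       // min_support
                  0.25,    // min_frac
                  1e-9,    // min_likelihood
                  false,   // leave_uncalled
                  30);     // default_quality (Phred)
}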
/*
 * Returns the element index randomly sampled from the log
 * probabilities in vals (length is the number of elements).
 */
int log_sample(double* vals, int length) {
    double normalizer = safe_log(0.0);
    int ii;
    for (ii = 0; ii < length; ++ii) {
        normalizer = log_sum(normalizer, vals[ii]);
    }
    double val = 0, sum = 0,
        cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
    for (ii = 0; ii < length; ++ii) {
        val = exp(vals[ii] - normalizer);
        sum += val;
        if (sum >= cutoff) break;
    }
    assert(ii < length);
    return ii;
}
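// log_sum is assumed to be a numerically stable log-add-exp,
// i.e. log_sum(a, b) = log(exp(a) + exp(b)) computed without leaving log
// space. A standard sketch of that missing helper (an assumption):
#include <cmath>
#include <utility>

inline double log_sum(double log_a, double log_b) {
    if (std::isinf(log_a) && log_a < 0) return log_b;  // log(0) + x = x
    if (std::isinf(log_b) && log_b < 0) return log_a;
    if (log_a < log_b) std::swap(log_a, log_b);        // keep the larger term
    return log_a + std::log1p(std::exp(log_b - log_a));
}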
int log_vector_sample(const std::vector<double>& vals, int length) {
    double normalizer = safe_log(0.0);
    int ii = 0;
    assert(length > 0 && length <= (int)vals.size());
    for (ii = 0; ii < length; ++ii) {
        normalizer = log_sum(normalizer, vals[ii]);
    }
    double val = 0, sum = 0,
        cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
    for (ii = 0; ii < length; ++ii) {
        val = exp(vals[ii] - normalizer);
        sum += val;
        if (sum >= cutoff) break;
    }
    assert(ii < length);
    return ii;
}
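// Illustrative draw (hypothetical values): sample an index in proportion to
// the probabilities 0.2 / 0.5 / 0.3, passed in log space.
#include <cmath>
#include <cstdlib>
#include <vector>

int sample_demo() {
    std::srand(42);  // seed the rand() used by the sampler
    std::vector<double> logp;
    logp.push_back(std::log(0.2));
    logp.push_back(std::log(0.5));
    logp.push_back(std::log(0.3));
    return log_vector_sample(logp, (int)logp.size());  // 0, 1, or 2
}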
void vct_log(vct* x) {
    size_t size = x->size();
    for (size_t i = 0; i < size; ++i)
        x->at(i) = safe_log(x->at(i));
}
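// vct is not defined in this snippet; the size()/at() calls suggest it is an
// alias for std::vector<double> (an assumption):
#include <vector>
typedef std::vector<double> vct;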
vector<string> DecodingLayer::getDecodedLabels()
{
    int nbWords = words.size();
    for(int t = 1; t < T; ++t)
    {
        token highestOutputToken = getHighestScoreOutputToken(t);
        for(int i = 0; i < nbWords; ++i)
        {
            words[i].tok[0][t] = highestOutputToken;
            words[i].tok[0][t].history.push_back(words[i].label); // add w to tok(w, 0, t) history

            string w_prime = createExtendedLabel(words[i].label);
            int S = w_prime.size();
            for(int s = 0; s < S; ++s)
            {
                // compute maxTok directly instead of materializing the candidate set P
                token maxTok = words[i].tok[s+2][t-1];

                // (s == 0) ? 0 : s+1 would treat the first segment specially,
                // but gives better results without the condition
                int prevSeg = s + 1;
                if(words[i].tok[prevSeg][t-1].score > maxTok.score)
                    maxTok = words[i].tok[prevSeg][t-1];

                if(w_prime[s] != ' ' && s >= 2 && w_prime[s-2] != w_prime[s])
                {
                    if(words[i].tok[s][t-1].score > maxTok.score)
                        maxTok = words[i].tok[s][t-1];
                }
                words[i].tok[s+2][t] = maxTok; // highest scoring token from set P
                words[i].tok[s+2][t].score += safe_log( y(t, alphabet[w_prime[s]]) );
            }

            // compute the highest scoring output token
            token maxTok = words[i].tok[S + 1][t];
            if(words[i].tok[S][t].score > maxTok.score)
                maxTok = words[i].tok[S][t];
            words[i].tok[1][t] = maxTok;
        }
    }

    // output the top 10 best words
    sortVector(words);
    token maxTok = words[0].tok[1][T-1];
    cout << "+++ ";
    for(size_t i = 0; i < maxTok.history.size(); ++i)
        cout << maxTok.history[i] << " ";
    cout << "++++\n";

    vector<string> result;
    for(int i = 0; i < 10 && i < nbWords; ++i)
        result.push_back(words[i].label);
    return result; // alternatively: maxTok.history
}
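// createExtendedLabel is assumed to build the CTC-style extended label:
// blanks (here, ' ') interleaved around every character, which yields the
// S = 2*|w| + 1 segments that initTokensWord sizes its matrix for.
// A sketch of that assumption:
#include <string>

std::string createExtendedLabel(const std::string& label) {
    std::string extended(" ");      // leading blank segment
    for (size_t i = 0; i < label.size(); ++i) {
        extended += label[i];       // character segment
        extended += ' ';            // blank segment after each character
    }
    return extended;                // length = 2*label.size() + 1
}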
void vct_log(gsl_vector* v)
{
    for (unsigned int i = 0; i < v->size; i++)
    {
        vset(v, i, safe_log(vget(v, i)));
    }
}
void viterbi(int *x_T, int *x_N, double *x_A, double *x_Pi, double *mu,
             double *sigma, double *obs, int *overlap, double *overlaps,
             int *overlap_ids, int *no_overlaps, int *start_overlaps,
             int *dist, int *L, int *distance, double *P, int *Q,
             double *mean_ref, double *sd_min, double *mean_sd, int *prior,
             double *x_W_A, double *W_Pi)
{
    int N = *x_N;
    int T = *x_T;
    double A[N][N];
    double W_A[N][N];
    double Pi[N];
    double delta[T][N];
    int psi[T][N];

    // Fill A and Pi
    for (int i = 0; i < N; i++) {
        for (int j = 0, index = i; j < N; j++, index += N) {
            if (*dist)
                A[i][j] = x_A[index];
            else
                A[i][j] = safe_log(x_A[index]);
            W_A[i][j] = x_W_A[index];
        }
        Pi[i] = safe_log(x_Pi[i]);
    }

    // Initialization
    for (int i = 0; i < N; i++) {
        delta[0][i] = emission_prob(obs[0], mu[i], sigma[i], 1) + Pi[i];
    }

    // Recursion
    int no_olaps;
    int start;
    double sum_olap;
    double trans;
    if (T > 1) {
        for (int t = 1; t < T; t++) {
            no_olaps = no_overlaps[t];
            int olap_ids[no_olaps];
            double olaps[no_olaps];
            start = start_overlaps[t];
            sum_olap = 1.0;
            if (*overlap) {
                for (int i = 0; i < no_olaps; i++) {
                    olap_ids[i] = overlap_ids[start + i];
                    olaps[i] = overlaps[start + i];
                    sum_olap += olaps[i];
                }
                olaps[no_olaps-1] = 1.0;
            }
            for (int j = 0; j < N; j++) {
                double prev[N];
                if (*dist)
                    prev[0] = safe_log(trans_dist(distance[t], A[0][j], *L, N)) + delta[t-1][0];
                else
                    prev[0] = A[0][j] + delta[t-1][0];
                double max = prev[0];
                int maxid = 0;
                if (N > 1) {
                    for (int i = 1; i < N; i++) {
                        if (*dist)
                            prev[i] = safe_log(trans_dist(distance[t], A[i][j], *L, N)) + delta[t-1][i];
                        else
                            prev[i] = A[i][j] + delta[t-1][i];
                        if (prev[i] > max) {
                            maxid = i;
                            max = prev[i];
                        }
                    }
                }
                psi[t][j] = maxid;
                trans = 0.0;
                if (*overlap) {
                    int qt[no_olaps];
                    if (no_olaps > 1) {
                        int q = j;
                        int iter = no_olaps-2;
                        for (int i = t-1; i >= olap_ids[0]; i--) {
                            q = psi[i+1][q];
                            if (member(i, olap_ids, no_olaps)) {
                                qt[iter] = q;
                                iter--;
                            }
                        }
                    }
                    qt[no_olaps-1] = j;
                    int id;
                    for (int i = 0; i < no_olaps; i++) {
                        id = qt[i];
                        trans += emission_prob(obs[t], mu[id], sigma[id], 1) +
                                 safe_log(olaps[i] / sum_olap);
                    }
                } else {
                    trans = emission_prob(obs[t], mu[j], sigma[j], 1);
                }
                if (*dist)
                    delta[t][j] = delta[t-1][psi[t][j]] +
                                  safe_log(trans_dist(distance[t], A[psi[t][j]][j], *L, N)) + trans;
                else
                    delta[t][j] = delta[t-1][psi[t][j]] + A[psi[t][j]][j] + trans;
            }
        }
    }

    // Termination
    double max = delta[T-1][0];
    int maxid = 0;
    if (N > 1) {
        for (int i = 1; i < N; i++) {
            if (delta[T-1][i] > max) {
                maxid = i;
                max = delta[T-1][i];
            }
        }
    }
    Q[T-1] = maxid;
    *P = delta[T-1][Q[T-1]];

    // Calculate parameter prior probability
    if (*prior) {
        for (int i = 0; i < N; i++) {
            *P += safe_log(Dirichlet(A[i], W_A[i], N));
            *P += safe_log(*sd_min / sigma[i]) +
                  emission_prob(mu[i], mean_ref[i], *mean_sd, 1);
        }
        *P += safe_log(Dirichlet(Pi, W_Pi, N));
    }

    // Path backtracking
    if (T > 1) {
        for (int t = T-2; t >= 0; t--) {
            Q[t] = psi[t+1][Q[t+1]];
        }
    }
}
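// emission_prob is assumed to return the Gaussian log-density of obs under
// N(mu, sigma^2) when its last argument is 1 (a log flag), since its result
// is added to other log-domain terms above. A sketch of that assumption:
#include <cmath>

double emission_prob(double obs, double mu, double sigma, int use_log) {
    double z = (obs - mu) / sigma;
    double log_p = -0.5 * z * z - std::log(sigma)
                   - 0.9189385332046727;  // log(sqrt(2*pi))
    return use_log ? log_p : std::exp(log_p);
}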