/* tag_viterbi:
 *   This function implements the Viterbi algorithm in order to decode the
 *   most probable sequence of labels according to the model. As expected,
 *   parts of this code are very similar to the computation of the gradient.
 *
 *   And as for the gradient, the caller is responsible for ensuring there
 *   is enough stack space.
 */
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
                 uint32_t out[], double *sc, double psc[]) {
    const uint32_t Y = mdl->nlbl;
    const uint32_t T = seq->len;
    double   *vpsi  = xvm_new(T * Y * Y);
    uint32_t *vback = xmalloc(sizeof(uint32_t) * T * Y);
    double   (*psi)[T][Y][Y] = (void *)vpsi;
    uint32_t (*back)[T][Y]   = (void *)vback;
    double *cur = xmalloc(sizeof(double) * Y);
    double *old = xmalloc(sizeof(double) * Y);
    // We first compute the scores for each transition in the lattice of
    // labels. The returned 'op' flag tells us whether the scores must be
    // combined with a product (real space) or a sum (log space).
    int op;
    if (mdl->type == 1)
        op = tag_memmsc(mdl, seq, vpsi);
    else if (mdl->opt->lblpost)
        op = tag_postsc(mdl, seq, vpsi);
    else
        op = tag_expsc(mdl, seq, vpsi);
    if (mdl->opt->force)
        tag_forced(mdl, seq, vpsi, op);
    // Now we can run the Viterbi algorithm. This is very similar to the
    // forward pass
    //   | α_1(y) = Ψ_1(y,x_1)
    //   | α_t(y) = max_{y'} α_{t-1}(y') + Ψ_t(y',y,x_t)
    // We just replace the sum by a max, and as we do the computation in
    // logarithmic space the product becomes a sum. (This also means that
    // we don't have to worry about numerical problems.)
    //
    // Next we have to walk backward over the α in order to find the best
    // path. To do this efficiently, we keep in the 'back' array the index
    // of the y value selected by the max. This also means that we only
    // need the current and previous α vectors, not the full matrix.
    for (uint32_t y = 0; y < Y; y++)
        cur[y] = (*psi)[0][0][y];
    for (uint32_t t = 1; t < T; t++) {
        for (uint32_t y = 0; y < Y; y++)
            old[y] = cur[y];
        for (uint32_t y = 0; y < Y; y++) {
            double   bst = -HUGE_VAL;
            uint32_t idx = 0;
            for (uint32_t yp = 0; yp < Y; yp++) {
                double val = old[yp];
                if (op)
                    val *= (*psi)[t][yp][y];
                else
                    val += (*psi)[t][yp][y];
                if (val > bst) {
                    bst = val;
                    idx = yp;
                }
            }
            (*back)[t][y] = idx;
            cur[y] = bst;
        }
    }
    // We can now build the sequence of labels predicted by the model. For
    // this we search the last α vector for the best value. Using this
    // index as a starting point in the back-pointer array, we can finally
    // decode the best sequence.
    uint32_t bst = 0;
    for (uint32_t y = 1; y < Y; y++)
        if (cur[y] > cur[bst])
            bst = y;
    if (sc != NULL)
        *sc = cur[bst];
    for (uint32_t t = T; t > 0; t--) {
        const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
        const uint32_t y  = bst;
        out[t - 1] = y;
        if (psc != NULL)
            psc[t - 1] = (*psi)[t - 1][yp][y];
        bst = yp;
    }
    free(old);
    free(cur);
    free(vback);
    xvm_free(vpsi);
}
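/* As a reading aid, here is a minimal, self-contained sketch of the same
 * log-space Viterbi recursion with plain VLAs. It is illustrative only:
 * the function name 'viterbi_sketch' is hypothetical, it assumes the
 * scores are already in log space (the op == 0 case above), and it uses
 * the same convention that psi[0][0][y] holds the initial scores.
 */
#if 0 /* sketch only, not compiled with the library */
#include <math.h>
#include <stdint.h>

static void viterbi_sketch(uint32_t T, uint32_t Y,
                           const double psi[T][Y][Y], uint32_t out[T]) {
    double   cur[Y], old[Y];   // current and previous α vectors
    uint32_t back[T][Y];       // back-pointers selected by the max
    for (uint32_t y = 0; y < Y; y++)
        cur[y] = psi[0][0][y];
    for (uint32_t t = 1; t < T; t++) {
        for (uint32_t y = 0; y < Y; y++)
            old[y] = cur[y];
        for (uint32_t y = 0; y < Y; y++) {
            double bst = -HUGE_VAL; uint32_t idx = 0;
            for (uint32_t yp = 0; yp < Y; yp++) {
                const double val = old[yp] + psi[t][yp][y];
                if (val > bst) { bst = val; idx = yp; }
            }
            back[t][y] = idx;
            cur[y] = bst;
        }
    }
    // Pick the best final label, then follow the back-pointers.
    uint32_t bst = 0;
    for (uint32_t y = 1; y < Y; y++)
        if (cur[y] > cur[bst]) bst = y;
    for (uint32_t t = T; t > 0; t--) {
        out[t - 1] = bst;
        bst = (t != 1) ? back[t - 1][bst] : 0;
    }
}
#endif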
/* tag_nbviterbi:
 *   This function implements the Viterbi algorithm in order to decode the
 *   N most probable sequences of labels according to the model. It can be
 *   used to compute only the best one, in which case it will return the
 *   same sequence as the previous function, but more slowly.
 */
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
                   uint32_t **out, double sc[], double **psc) {
    const uint32_t Y = mdl->nlbl;
    const uint32_t T = seq->len;
    double   *vpsi  = xvm_new(T * Y * Y);
    uint32_t *vback = xmalloc(sizeof(uint32_t) * T * Y * N);
    double   (*psi)[T][Y][Y]   = (void *)vpsi;
    uint32_t (*back)[T][Y * N] = (void *)vback;
    double *cur = xmalloc(sizeof(double) * Y * N);
    double *old = xmalloc(sizeof(double) * Y * N);
    // We first compute the scores for each transition in the lattice of
    // labels.
    int op;
    if (mdl->type == 1)
        op = tag_memmsc(mdl, seq, vpsi);
    else if (mdl->opt->lblpost)
        op = tag_postsc(mdl, seq, vpsi);
    else
        op = tag_expsc(mdl, seq, vpsi);
    if (mdl->opt->force)
        tag_forced(mdl, seq, vpsi, op);
    // Here also it is classical, but we have to keep the N best paths
    // leading to each node of the lattice instead of only the best one.
    // This means the code is less trivial, and the current implementation
    // is not the most efficient way to do it, but it works well and is
    // good enough for the moment.
    // For each node we first build the list of all incoming arcs from the
    // N-best paths of all predecessor nodes, and then select the N best
    // of them. There is a lot of room here for later optimisations if
    // needed. (A standalone sketch of the selection step follows the
    // function.)
    for (uint32_t y = 0, d = 0; y < Y; y++) {
        cur[d++] = (*psi)[0][0][y];
        for (uint32_t n = 1; n < N; n++)
            cur[d++] = -DBL_MAX;
    }
    for (uint32_t t = 1; t < T; t++) {
        for (uint32_t d = 0; d < Y * N; d++)
            old[d] = cur[d];
        for (uint32_t y = 0; y < Y; y++) {
            // 1st, build the list of all incoming arcs
            double lst[Y * N];
            for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
                for (uint32_t n = 0; n < N; n++, d++) {
                    lst[d] = old[d];
                    if (op)
                        lst[d] *= (*psi)[t][yp][y];
                    else
                        lst[d] += (*psi)[t][yp][y];
                }
            }
            // 2nd, initialize the back-pointers with the N first
            uint32_t *bk = &(*back)[t][y * N];
            for (uint32_t n = 0; n < N; n++)
                bk[n] = n;
            // 3rd, search for the N highest values
            for (uint32_t i = N; i < N * Y; i++) {
                // Search the smallest currently kept value
                uint32_t idx = 0;
                for (uint32_t n = 1; n < N; n++)
                    if (lst[bk[n]] < lst[bk[idx]])
                        idx = n;
                // And replace it if needed
                if (lst[i] > lst[bk[idx]])
                    bk[idx] = i;
            }
            // 4th, get the new scores
            for (uint32_t n = 0; n < N; n++)
                cur[y * N + n] = lst[bk[n]];
        }
    }
    // Retrieving the best paths is similar to classical Viterbi, except
    // that we have to search for the N best ones and there are N times
    // more possible starts.
    for (uint32_t n = 0; n < N; n++) {
        uint32_t bst = 0;
        for (uint32_t d = 1; d < Y * N; d++)
            if (cur[d] > cur[bst])
                bst = d;
        if (sc != NULL)
            sc[n] = cur[bst];
        cur[bst] = -DBL_MAX;
        for (uint32_t t = T; t > 0; t--) {
            const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N : 0;
            const uint32_t y  = bst / N;
            out[t - 1][n] = y;
            if (psc != NULL)
                psc[t - 1][n] = (*psi)[t - 1][yp][y];
            // back[0] is never written, so don't read it at t == 1
            bst = (t != 1) ? (*back)[t - 1][bst] : 0;
        }
    }
    free(old);
    free(cur);
    free(vback);
    xvm_free(vpsi);
}
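/* Steps 2 and 3 above amount to "keep the indices of the N largest entries
 * of lst[]". A minimal, self-contained sketch of that selection is kept
 * below for reference; the name 'select_nbest' is hypothetical and not
 * part of the library. Note that the kept indices are not sorted by score,
 * which is also true of the code above; a heap would bring the cost down
 * from O(L * N) to O(L * log N) per node if it ever mattered.
 */
#if 0 /* sketch only, not compiled with the library */
#include <stdint.h>

/* Fill bk[0..N-1] with the indices of the N largest values in lst[0..L-1],
 * assuming 0 < N <= L. This is the O(L * N) scheme used per lattice node
 * by tag_nbviterbi, where L = Y * N. */
static void select_nbest(const double lst[], uint32_t L, uint32_t N,
                         uint32_t bk[]) {
    // Start with the first N candidates...
    for (uint32_t n = 0; n < N; n++)
        bk[n] = n;
    for (uint32_t i = N; i < L; i++) {
        // ...then find the weakest kept one...
        uint32_t idx = 0;
        for (uint32_t n = 1; n < N; n++)
            if (lst[bk[n]] < lst[bk[idx]])
                idx = n;
        // ...and replace it if the new candidate beats it.
        if (lst[i] > lst[bk[idx]])
            bk[idx] = i;
    }
}
#endif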
/* trn_lbfgs:
 *   Train the model with the l-bfgs algorithm, switching to the owl-qn
 *   variant when an L1 penalty is requested (rho1 != 0).
 */
void trn_lbfgs(mdl_t *mdl) {
    const size_t F  = mdl->nftr;
    const int    K  = mdl->opt->maxiter;
    const int    C  = mdl->opt->objwin;
    const int    M  = mdl->opt->lbfgs.histsz;
    const size_t W  = mdl->opt->nthread;
    const bool   l1 = mdl->opt->rho1 != 0.0;
    double *x, *xp; // Current and previous value of the variables
    double *g, *gp; // Current and previous value of the gradient
    double *pg;     // The pseudo-gradient (only for owl-qn)
    double *d;      // The search direction
    double *s[M];   // History value s_k = Δ(x,xp)
    double *y[M];   // History value y_k = Δ(g,gp)
    double  p[M];   // ρ_k
    double  fh[C];  // f(x) history
    grd_t  *grds[W];
    // Initialization: Here we have to allocate memory on the heap, as
    // requesting that much memory on the stack would have too big an
    // impact on performance and would be refused by the system for non-
    // trivial models.
    x  = mdl->theta;
    xp = xvm_new(F); g = xvm_new(F);
    gp = xvm_new(F); d = xvm_new(F);
    for (int m = 0; m < M; m++) {
        s[m] = xvm_new(F);
        y[m] = xvm_new(F);
    }
    pg = l1 ? xvm_new(F) : NULL;
    grds[0] = grd_new(mdl, g);
    for (size_t w = 1; w < W; w++)
        grds[w] = grd_new(mdl, xvm_new(F));
    // Minimization: This is the heart of the function. (A big heart...)
    // We will iterate until one of these conditions is reached:
    //   - the maximum iteration count is reached
    //   - we have converged (up to numerical precision)
    //   - the report function returns false
    //   - an error happens somewhere
    double fx = grd_gradient(mdl, g, grds);
    for (int k = 0; !uit_stop && k < K; k++) {
        // We first compute the pseudo-gradient of f for owl-qn. It is
        // defined in [3, pp 35(4)]
        //                | ∂_i^- f(x)   if ∂_i^- f(x) > 0
        //   ◇_i f(x) =   | ∂_i^+ f(x)   if ∂_i^+ f(x) < 0
        //                | 0            otherwise
        // with
        //   ∂_i^± f(x) = ∂/∂x_i l(x) + | Cσ(x_i)   if x_i ≠ 0
        //                              | ±C        if x_i = 0
        if (l1) {
            const double rho1 = mdl->opt->rho1;
            for (size_t f = 0; f < F; f++) {
                if (x[f] < 0.0)
                    pg[f] = g[f] - rho1;
                else if (x[f] > 0.0)
                    pg[f] = g[f] + rho1;
                else if (g[f] < -rho1)
                    pg[f] = g[f] + rho1;
                else if (g[f] > rho1)
                    pg[f] = g[f] - rho1;
                else
                    pg[f] = 0.0;
            }
        }
        // 1st step: We compute the search direction. We search in the
        // direction that minimizes the second order approximation given
        // by the Taylor series, which gives
        //   d_k = - H_k^{-1} g_k
        // But computing the inverse of the Hessian is intractable, so
        // l-bfgs approximates it from a short history of updates applied
        // to a diagonal initial matrix. The exact computation is well
        // described in [1, pp 779]. (A standalone sketch of this two-loop
        // recursion follows the function.)
        // The only special thing for owl-qn here is to use the pseudo-
        // gradient instead of the true one.
        xvm_neg(d, l1 ? pg : g, F);
        if (k != 0) {
            const int km  = k % M;
            const int bnd = (k <= M) ? k : M;
            double alpha[M], beta;
            //   α_i = ρ_j s_j^T q_{i+1}
            //   q_i = q_{i+1} - α_i y_i
            for (int i = bnd; i > 0; i--) {
                const int j = (k - i + M + 1) % M;
                alpha[i - 1] = p[j] * xvm_dot(s[j], d, F);
                xvm_axpy(d, -alpha[i - 1], y[j], d, F);
            }
            //   r_0 = H_0 q_0
            // Scaling is described in [2, pp 515]
            //   for k = 0: H_0 = I
            //   for k > 0: H_0 = I * y_k^T s_k / ||y_k||²
            //                  = I * 1 / (ρ_k ||y_k||²)
            const double y2 = xvm_dot(y[km], y[km], F);
            const double v  = 1.0 / (p[km] * y2);
            for (size_t f = 0; f < F; f++)
                d[f] *= v;
            //   β_j     = ρ_j y_j^T r_i
            //   r_{i+1} = r_i + s_j (α_i - β_j)
            for (int i = 0; i < bnd; i++) {
                const int j = (k - i + M) % M;
                beta = p[j] * xvm_dot(y[j], d, F);
                xvm_axpy(d, alpha[i] - beta, s[j], d, F);
            }
        }
        // For owl-qn, we must remain in the same orthant as the pseudo-
        // gradient, so we have to constrain the search direction as
        // described in [3, pp 35(3)]
        //   d^k = π(d^k ; v^k)
        //       = π(d^k ; -◇f(x^k))
        if (l1)
            for (size_t f = 0; f < F; f++)
                if (d[f] * pg[f] >= 0.0)
                    d[f] = 0.0;
        // 2nd step: We perform a linesearch in the computed direction:
        // we search for a step value that satisfies the constraints
        // using a backtracking algorithm. More elaborate algorithms can
        // perform better in the general case, but for CRF training,
        // backtracking is very efficient and simple to implement.
        // For quasi-Newton the natural step is 1.0, so we start with
        // this one and reduce it only if it fails, with an exception for
        // the first iteration where a better guess can be made.
        // We have to keep track of the current point and gradient, as we
        // will need to compute the delta between those and the found
        // point, and perhaps need to restore them if the linesearch
        // fails.
        memcpy(xp, x, sizeof(double) * F);
        memcpy(gp, g, sizeof(double) * F);
        double sc  = (k == 0) ? 0.1 : 0.5;
        double stp = (k == 0) ? 1.0 / xvm_norm(d, F) : 1.0;
        double gd  = l1 ? 0.0 : xvm_dot(g, d, F); // gd = g_k^T d_k
        double fi  = fx;
        bool   err = false;
        for (int ls = 1; !uit_stop; ls++, stp *= sc) {
            // We compute the new point using the current step and
            // search direction
            xvm_axpy(x, stp, d, xp, F);
            // For owl-qn, we have to project the point back into the
            // current orthant [3, pp 35]
            //   x^{k+1} = π(x^k + αp^k ; ξ)
            if (l1) {
                for (size_t f = 0; f < F; f++) {
                    double or = xp[f];
                    if (or == 0.0)
                        or = -pg[f];
                    if (x[f] * or <= 0.0)
                        x[f] = 0.0;
                }
            }
            // And we ask for the value of the objective function and
            // its gradient.
            fx = grd_gradient(mdl, g, grds);
            // Now we check if the step satisfies the conditions. For
            // l-bfgs, we check the classical decrease and curvature
            // conditions known as the Wolfe conditions [2, pp 506]
            //   f(x_k + α_k d_k) ≤ f(x_k) + β' α_k g_k^T d_k
            //   g(x_k + α_k d_k)^T d_k ≥ β g_k^T d_k
            //
            // And for owl-qn we check a variant of the Armijo rule
            // described in [3, pp 36]
            //   f(π(x^k+αp^k;ξ)) ≤ f(x^k) - γv^T[π(x^k+αp^k;ξ)-x^k]
            if (!l1) {
                if (fx > fi + stp * gd * 1e-4)
                    sc = 0.5;
                else if (xvm_dot(g, d, F) < gd * 0.9)
                    sc = 2.1;
                else
                    break;
            } else {
                double vp = 0.0;
                for (size_t f = 0; f < F; f++)
                    vp += (x[f] - xp[f]) * d[f];
                if (fx < fi + vp * 1e-4)
                    break;
            }
            // If we reach the maximum number of linesearch steps
            // without finding a good one, we just fail.
            if (ls == mdl->opt->lbfgs.maxls) {
                warning("maximum linesearch reached");
                err = true;
                break;
            }
        }
        // If the linesearch failed or the user interrupted training, we
        // return to the last valid point and stop the training. The
        // model is probably not fully optimized, but we let the user
        // decide what to do with it.
        if (err || uit_stop) {
            memcpy(x, xp, sizeof(double) * F);
            break;
        }
        if (uit_progress(mdl, k + 1, fx) == false)
            break;
        // 3rd step: We update the history used for approximating the
        // inverse of the Hessian
        //   s_k = x_{k+1} - x_k
        //   y_k = g_{k+1} - g_k
        //   ρ_k = 1 / y_k^T s_k
        const int kn = (k + 1) % M;
        xvm_sub(s[kn], x, xp, F);
        xvm_sub(y[kn], g, gp, F);
        p[kn] = 1.0 / xvm_dot(y[kn], s[kn], F);
        // And last, we check for convergence. The convergence check is
        // quite simple [2, pp 508]
        //   ||g|| / max(1, ||x||) ≤ ε
        // with ε small enough so we stop when numerical precision is
        // reached. For owl-qn we just have to check against the pseudo-
        // gradient instead of the true one.
        const double xn = xvm_norm(x, F);
        const double gn = xvm_norm(l1 ? pg : g, F);
        if (gn / max(xn, 1.0) <= 1e-5)
            break;
        if (k + 1 == K)
            break;
        // The second stopping criterion tested is a check for
        // improvement of the function value over the past C iterations.
        // When this comes under an epsilon, we also stop the
        // minimization.
        fh[k % C] = fx;
        double dlt = 1.0;
        if (k >= C) {
            const double of = fh[(k + 1) % C];
            dlt = fabs(of - fx) / of;
            if (dlt < mdl->opt->stopeps)
                break;
        }
    }
    // Cleanup: We free all the vectors we have allocated.
    xvm_free(xp); xvm_free(g);
    xvm_free(gp); xvm_free(d);
    for (int m = 0; m < M; m++) {
        xvm_free(s[m]);
        xvm_free(y[m]);
    }
    if (l1)
        xvm_free(pg);
    for (size_t w = 1; w < W; w++)
        xvm_free(grds[w]->g);
    for (size_t w = 0; w < W; w++)
        grd_free(grds[w]);
}
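/* The direction computation in the 1st step above is the standard l-bfgs
 * "two-loop recursion". A minimal, self-contained sketch with plain arrays
 * is kept below for reference; the names 'lbfgs_direction' and 'dot' are
 * hypothetical and not part of the trainer. To sidestep the modular
 * indexing into the circular history used above, it assumes the m stored
 * pairs are ordered from oldest (i = 0) to newest (i = m - 1).
 */
#if 0 /* sketch only, not compiled with the library */
#include <stddef.h>

static double dot(const double *a, const double *b, size_t n) {
    double r = 0.0;
    for (size_t i = 0; i < n; i++)
        r += a[i] * b[i];
    return r;
}

/* Compute d = -H_k^{-1} g from m history pairs (s[i], y[i]) with
 * rho[i] = 1 / (y[i]^T s[i]); alpha[] is scratch space of size m. Running
 * the recursion on -g directly is fine since every step is linear. */
static void lbfgs_direction(size_t n, size_t m, const double g[],
                            double *const s[], double *const y[],
                            const double rho[], double alpha[], double d[]) {
    for (size_t f = 0; f < n; f++)
        d[f] = -g[f];
    // First loop, newest to oldest: α_i = ρ_i s_i^T q; q -= α_i y_i
    for (size_t j = m; j > 0; j--) {
        const size_t i = j - 1;
        alpha[i] = rho[i] * dot(s[i], d, n);
        for (size_t f = 0; f < n; f++)
            d[f] -= alpha[i] * y[i][f];
    }
    // Scale by H_0 = I * y^T s / ||y||² using the newest pair [2, pp 515]
    if (m > 0) {
        const double v = dot(s[m - 1], y[m - 1], n)
                       / dot(y[m - 1], y[m - 1], n);
        for (size_t f = 0; f < n; f++)
            d[f] *= v;
    }
    // Second loop, oldest to newest: β = ρ_i y_i^T r; r += s_i (α_i - β)
    for (size_t i = 0; i < m; i++) {
        const double beta = rho[i] * dot(y[i], d, n);
        for (size_t f = 0; f < n; f++)
            d[f] += (alpha[i] - beta) * s[i][f];
    }
}
#endif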