Example #1
/* tag_viterbi:
 *   This function implements the Viterbi algorithm to decode the most
 *   probable sequence of labels according to the model. As expected, parts
 *   of this code are very similar to the computation of the gradient.
 *
 *   And as for the gradient, the caller is responsible for ensuring there
 *   is enough stack space.
 */
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
	         uint32_t out[], double *sc, double psc[]) {
	const uint32_t Y = mdl->nlbl;
	const uint32_t T = seq->len;
	double   *vpsi  = xvm_new(T * Y * Y);
	uint32_t *vback = xmalloc(sizeof(uint32_t) * T * Y);
	double   (*psi) [T][Y][Y] = (void *)vpsi;
	uint32_t (*back)[T][Y]    = (void *)vback;
	double *cur = xmalloc(sizeof(double) * Y);
	double *old = xmalloc(sizeof(double) * Y);
	// We first compute the scores for each transition in the lattice of
	// labels.
	int op;
	if (mdl->type == 1)
		op = tag_memmsc(mdl, seq, vpsi);
	else if (mdl->opt->lblpost)
		op = tag_postsc(mdl, seq, vpsi);
	else
		op = tag_expsc(mdl, seq, vpsi);
	if (mdl->opt->force)
		tag_forced(mdl, seq, vpsi, op);
	// Now we can run the Viterbi algorithm proper. This is very similar to
	// the forward pass
	//   | α_1(y) = Ψ_1(y,x_1)
	//   | α_t(y) = max_{y'} α_{t-1}(y') + Ψ_t(y',y,x_t)
	// We just replace the sum by a max and, as we work in logarithmic
	// space, the products become sums. (This also means that we don't have
	// to worry about numerical underflow.)
	//
	// Next we have to walk backward over the α in order to find the best
	// path. To do this efficiently, we store in the 'back' array the index
	// of the y' value selected by the max. This also means that we only
	// need the current and previous α vectors, not the full matrix.
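	// As a toy illustration with made-up numbers, for Y = 2: if
	// α_{t-1} = (-1.0, -2.5) and the incoming scores for label 0 are
	// Ψ_t(y',0,x_t) = (-0.3, -0.1), then α_t(0) = max(-1.3, -2.6) = -1.3
	// and back[t][0] = 0, since the winning arc leaves from y' = 0.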
	for (uint32_t y = 0; y < Y; y++)
		cur[y] = (*psi)[0][0][y];
	for (uint32_t t = 1; t < T; t++) {
		for (uint32_t y = 0; y < Y; y++)
			old[y] = cur[y];
		for (uint32_t y = 0; y < Y; y++) {
			double   bst = -HUGE_VAL;
			uint32_t idx = 0;
			for (uint32_t yp = 0; yp < Y; yp++) {
				double val = old[yp];
				if (op)
					val *= (*psi)[t][yp][y];
				else
					val += (*psi)[t][yp][y];
				if (val > bst) {
					bst = val;
					idx = yp;
				}
			}
			(*back)[t][y] = idx;
			cur[y]        = bst;
		}
	}
	// We can now build the sequence of labels predicted by the model. For
	// this, we search the last α vector for the best value. Using its
	// index as a starting point in the back-pointer array, we can finally
	// decode the best sequence.
	uint32_t bst = 0;
	for (uint32_t y = 1; y < Y; y++)
		if (cur[y] > cur[bst])
			bst = y;
	if (sc != NULL)
		*sc = cur[bst];
	for (uint32_t t = T; t > 0; t--) {
		const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] : 0;
		const uint32_t y  = bst;
		out[t - 1] = y;
		if (psc != NULL)
			psc[t - 1] = (*psi)[t - 1][yp][y];
		bst = yp;
	}
	free(old);
	free(cur);
	free(vback);
	xvm_free(vpsi);
}
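The recurrence above can also be exercised outside of the library. What follows is a self-contained toy sketch, not wapiti code: it runs the same log-space max recurrence and back-pointer walk on a fixed-size lattice with made-up Ψ scores. In the real decoder the scores come from tag_expsc() and its siblings; every name below is local to the demo. For these scores it prints the path 0 0 1 with score -1.100.

/* Toy log-space Viterbi (standalone sketch, not wapiti code): same max
 * recurrence and back-pointer walk as tag_viterbi, on a fixed 3x2 lattice
 * with made-up scores. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define T 3 /* sequence length  */
#define Y 2 /* number of labels */

int main(void) {
	/* psi[t][yp][y]: log-score of the arc y' -> y at position t. At
	 * t = 0 only psi[0][0][y] is meaningful, as in tag_viterbi. */
	static const double psi[T][Y][Y] = {
		{{-0.2, -1.6}, { 0.0,  0.0}},
		{{-0.5, -2.0}, {-1.8, -0.3}},
		{{-0.9, -0.4}, {-0.1, -2.2}},
	};
	double   cur[Y], old[Y];
	uint32_t back[T][Y];
	for (uint32_t y = 0; y < Y; y++)
		cur[y] = psi[0][0][y];
	for (uint32_t t = 1; t < T; t++) {
		for (uint32_t y = 0; y < Y; y++)
			old[y] = cur[y];
		for (uint32_t y = 0; y < Y; y++) {
			double   bst = -HUGE_VAL;
			uint32_t idx = 0;
			for (uint32_t yp = 0; yp < Y; yp++) {
				const double val = old[yp] + psi[t][yp][y];
				if (val > bst) {
					bst = val;
					idx = yp;
				}
			}
			back[t][y] = idx;
			cur[y]     = bst;
		}
	}
	/* Search the last α vector for the best score, then walk back. */
	uint32_t bst = 0;
	for (uint32_t y = 1; y < Y; y++)
		if (cur[y] > cur[bst])
			bst = y;
	const double sc = cur[bst];
	uint32_t out[T];
	for (uint32_t t = T; t > 0; t--) {
		out[t - 1] = bst;
		bst = (t != 1) ? back[t - 1][bst] : 0;
	}
	printf("best score %.3f, path:", sc);
	for (uint32_t t = 0; t < T; t++)
		printf(" %u", (unsigned)out[t]);
	putchar('\n');
	return 0;
}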
Example #2
/* tag_nbviterbi:
 *   This function implements the Viterbi algorithm to decode the N most
 *   probable sequences of labels according to the model. It can be used to
 *   compute only the best one; it will then return the same sequence as the
 *   previous function, only more slowly.
 */
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
                   uint32_t **out, double sc[], double **psc) {
	const uint32_t Y = mdl->nlbl;
	const uint32_t T = seq->len;
	double   *vpsi  = xvm_new(T * Y * Y);
	uint32_t *vback = xmalloc(sizeof(uint32_t) * T * Y * N);
	double   (*psi) [T][Y    ][Y] = (void *)vpsi;
	uint32_t (*back)[T][Y * N]    = (void *)vback;
	double *cur = xmalloc(sizeof(double) * Y * N);
	double *old = xmalloc(sizeof(double) * Y * N);
	// We first compute the scores for each transition in the lattice of
	// labels.
	int op;
	if (mdl->type == 1)
		op = tag_memmsc(mdl, seq, vpsi);
	else if (mdl->opt->lblpost)
		op = tag_postsc(mdl, seq, vpsi);
	else
		op = tag_expsc(mdl, seq, vpsi);
	if (mdl->opt->force)
		tag_forced(mdl, seq, vpsi, op);
	// Here also the algorithm is classical, but we have to keep the N best
	// paths leading to each node of the lattice instead of only the best
	// one. This means the code is less trivial, and the current
	// implementation is not the most efficient way to do this, but it
	// works well and is good enough for the moment.
	// We first build the list of all incoming arcs from the N best paths
	// of all predecessor nodes, and next select the N best of them. There
	// is a lot of room here for later optimisations if needed.
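	// As a toy illustration: with Y = 3 labels and N = 2, each node
	// considers Y * N = 6 candidate incoming arcs and keeps the best 2.
	// (A standalone sketch of this selection step follows the function.)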
	for (uint32_t y = 0, d = 0; y < Y; y++) {
		cur[d++] = (*psi)[0][0][y];
		for (uint32_t n = 1; n < N; n++)
			cur[d++] = -DBL_MAX;
	}
	for (uint32_t t = 1; t < T; t++) {
		for (uint32_t d = 0; d < Y * N; d++)
			old[d] = cur[d];
		for (uint32_t y = 0; y < Y; y++) {
			// 1st, build the list of all incoming arcs
			double lst[Y * N];
			for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
				for (uint32_t n = 0; n < N; n++, d++) {
					lst[d] = old[d];
					if (op)
						lst[d] *= (*psi)[t][yp][y];
					else
						lst[d] += (*psi)[t][yp][y];
				}
			}
			// 2nd, init the back-pointers with the first N arcs
			uint32_t *bk = &(*back)[t][y * N];
			for (uint32_t n = 0; n < N; n++)
				bk[n] = n;
			// 3rd, search the N highest values
			for (uint32_t i = N; i < N * Y; i++) {
				// Search for the smallest value currently kept
				uint32_t idx = 0;
				for (uint32_t n = 1; n < N; n++)
					if (lst[bk[n]] < lst[bk[idx]])
						idx = n;
				// And replace it if needed
				if (lst[i] > lst[bk[idx]])
					bk[idx] = i;
			}
			// 4th, get the new scores
			for (uint32_t n = 0; n < N; n++)
				cur[y * N + n] = lst[bk[n]];
		}
	}
	// Retrieving the best paths is similar to classical Viterbi except
	// that we have to search for the N best ones and there are N times
	// more possible starting points.
	for (uint32_t n = 0; n < N; n++) {
		uint32_t bst = 0;
		for (uint32_t d = 1; d < Y * N; d++)
			if (cur[d] > cur[bst])
				bst = d;
		if (sc != NULL)
			sc[n] = cur[bst];
		cur[bst] = -DBL_MAX;
		for (uint32_t t = T; t > 0; t--) {
			const uint32_t yp = (t != 1) ? (*back)[t - 1][bst] / N : 0;
			const uint32_t y  = bst / N;
			out[t - 1][n] = y;
			if (psc != NULL)
				psc[t - 1][n] = (*psi)[t - 1][yp][y];
			// Guard t == 1: back[0] is never written
			bst = (t != 1) ? (*back)[t - 1][bst] : 0;
		}
	}
	free(old);
	free(cur);
	free(vback);
	xvm_free(vpsi);
}
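Steps 2 and 3 of the inner loop, selecting the N largest candidate scores, can be isolated into a self-contained sketch. keep_nbest() below is a hypothetical helper written for this illustration, not a wapiti function: it mirrors the same replace-the-current-minimum strategy and, like the original, leaves the kept indices unsorted. For the made-up scores it keeps the entries -0.1, -0.5 and -1.7.

/* Standalone sketch of the N-best selection above (steps 2-3): keep the
 * indices of the N largest entries of a score list by repeatedly replacing
 * the smallest one currently kept. All data is made up. */
#include <stdint.h>
#include <stdio.h>

static void keep_nbest(const double lst[], uint32_t len,
                       uint32_t bk[], uint32_t N) {
	/* Start with the first N indices, then scan the rest. */
	for (uint32_t n = 0; n < N; n++)
		bk[n] = n;
	for (uint32_t i = N; i < len; i++) {
		/* Find the smallest value currently kept... */
		uint32_t idx = 0;
		for (uint32_t n = 1; n < N; n++)
			if (lst[bk[n]] < lst[bk[idx]])
				idx = n;
		/* ...and replace it if the new candidate is better. */
		if (lst[i] > lst[bk[idx]])
			bk[idx] = i;
	}
}

int main(void) {
	const double lst[] = {-3.0, -0.5, -2.2, -0.1, -4.0, -1.7};
	uint32_t bk[3];
	keep_nbest(lst, 6, bk, 3);
	for (uint32_t n = 0; n < 3; n++)
		printf("kept index %u (score %.1f)\n",
		       (unsigned)bk[n], lst[bk[n]]);
	return 0;
}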
Example #3
void trn_lbfgs(mdl_t *mdl) {
	const size_t F  = mdl->nftr;
	const int    K  = mdl->opt->maxiter;
	const int    C  = mdl->opt->objwin;
	const int    M  = mdl->opt->lbfgs.histsz;
	const size_t W  = mdl->opt->nthread;
	const bool   l1 = mdl->opt->rho1 != 0.0;
	double *x, *xp; // Current and previous value of the variables
	double *g, *gp; // Current and previous value of the gradient
	double *pg;     // The pseudo-gradient (only for owl-qn)
	double *d;      // The search direction
	double *s[M];   // History value s_k = Δ(x,px)
	double *y[M];   // History value y_k = Δ(g,pg)
	double  p[M];   // ρ_k
	double  fh[C];  // f(x) history
	grd_t  *grds[W];
	// Initialization: Here, we have to allocate memory on the heap, as we
	// cannot request that much memory on the stack: it would have too big
	// an impact on performance and would be refused by the system for any
	// non-trivial model.
	x  = mdl->theta;
	xp = xvm_new(F); g = xvm_new(F);
	gp = xvm_new(F); d = xvm_new(F);
	for (int m = 0; m < M; m++) {
		s[m] = xvm_new(F);
		y[m] = xvm_new(F);
	}
	pg = l1 ? xvm_new(F) : NULL;
	grds[0] = grd_new(mdl, g);
	for (size_t w = 1; w < W; w++)
		grds[w] = grd_new(mdl, xvm_new(F));
	// Minimization: This is the heart of the function. (a big heart...) We
	// will perform iterations until one of these conditions is reached:
	//   - the maximum iteration count is reached
	//   - we have converged (up to numerical precision)
	//   - the report function returns false
	//   - an error happens somewhere
	double fx = grd_gradient(mdl, g, grds);
	for (int k = 0; !uit_stop && k < K; k++) {
		// We first compute the pseudo-gradient of f for owl-qn. It is
		// defined in [3, pp 335(4)]
		//              | ∂_i^- f(x)   if ∂_i^- f(x) > 0
		//   ◇_i f(x) = | ∂_i^+ f(x)   if ∂_i^+ f(x) < 0
		//              | 0            otherwise
		// with
		//   ∂_i^± f(x) = ∂/∂x_i l(x) + | Cσ(x_i) if x_i ≠ 0
		//                              | ±C      if x_i = 0
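		// As a toy illustration with made-up numbers: for C = 0.5,
		// x_i = 0 and ∂/∂x_i l(x) = 0.8, we get ∂_i^- f(x) = 0.3 > 0,
		// so ◇_i f(x) = 0.3: the objective still decreases when x_i
		// leaves zero toward the negative orthant despite the penalty.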
		if (l1) {
			const double rho1 = mdl->opt->rho1;
			for (size_t f = 0; f < F; f++) {
				if (x[f] < 0.0)
					pg[f] = g[f] - rho1;
				else if (x[f] > 0.0)
					pg[f] = g[f] + rho1;
				else if (g[f] < -rho1)
					pg[f] = g[f] + rho1;
				else if (g[f] > rho1)
					pg[f] = g[f] - rho1;
				else
					pg[f] = 0.0;
			}
		}
		// 1st step: We compute the search direction. We search in the
		// direction that minimizes the second-order approximation given
		// by the Taylor series, which gives
		//   d_k = - H_k^{-1} g_k
		// But computing the inverse of the hessian is intractable, so
		// l-bfgs only maintains an approximation of it built from the
		// recent history, starting from a diagonal H_0. The exact
		// computation is well described in [1, pp 779].
		// The only special thing for owl-qn here is to use the pseudo-
		// gradient instead of the true one.
		xvm_neg(d, l1 ? pg : g, F);
		if (k != 0) {
			const int km = k % M;
			const int bnd = (k <= M) ? k : M;
			double alpha[M], beta;
			// α_i = ρ_j s_j^T q_{i+1}
			// q_i = q_{i+1} - α_i y_j
			for (int i = bnd; i > 0; i--) {
				const int j = (k - i + M + 1) % M;
				alpha[i - 1] = p[j] * xvm_dot(s[j], d, F);
				xvm_axpy(d, -alpha[i - 1], y[j], d, F);
			}
			// r_0 = H_0 q_0
			//     Scaling is described in [2, pp 515]
			//     for k = 0: H_0 = I
			//     for k > 0: H_0 = I * y_k^T s_k / ||y_k||²
			//                    = I * 1 / ρ_k ||y_k||²
			const double y2 = xvm_dot(y[km], y[km], F);
			const double v = 1.0 / (p[km] * y2);
			for (size_t f = 0; f < F; f++)
				d[f] *= v;
			// β_i     = ρ_j y_j^T r_i
			// r_{i+1} = r_i + (α_i - β_i) s_j
			for (int i = 0; i < bnd; i++) {
				const int j = (k - i + M) % M;
				beta = p[j] * xvm_dot(y[j], d, F);
				xvm_axpy(d, alpha[i] - beta, s[j], d, F);
			}
		}
		// For owl-qn, we must remain in the same orthant as the
		// pseudo-gradient, so we have to constrain the search
		// direction as described in [3, pp 35(3)]
		//   d^k = π(d^k ; v^k)
		//       = π(d^k ; -◇f(x^k))
		if (l1)
			for (size_t f = 0; f < F; f++)
				if (d[f] * pg[f] >= 0.0)
					d[f] = 0.0;
		// 2nd step: we perform a linesearch in the computed direction:
		// we search for a step value that satisfies the constraints
		// using a backtracking algorithm. More elaborate algorithms can
		// perform better in the general case, but for CRF training,
		// backtracking is very efficient and simple to implement.
		// For quasi-Newton, the natural step is 1.0, so we start with
		// this one and reduce it only if it fails, with an exception
		// for the first iteration where a better guess can be made.
		// We have to keep track of the current point and gradient, as
		// we will need to compute the delta between those and the found
		// point, and perhaps restore them if the linesearch fails.
		memcpy(xp, x, sizeof(double) * F);
		memcpy(gp, g, sizeof(double) * F);
		double sc  = (k == 0) ? 0.1 : 0.5;
		double stp = (k == 0) ? 1.0 / xvm_norm(d, F) : 1.0;
		double gd  = l1 ? 0.0 : xvm_dot(g, d, F); // gd = g_k^T d_k
		double fi  = fx;
		bool err = false;
		for (int ls = 1; !uit_stop; ls++, stp *= sc) {
			// We compute the new point using the current step and
			// search direction
			xvm_axpy(x, stp, d, xp, F);
			// For owl-qn, we have to project the point back into
			// the current orthant [3, pp 35]
			//   x^{k+1} = π(x^k + αp^k ; ξ)
			if (l1) {
				for (size_t f = 0; f < F; f++) {
					// 'ori' is the sign of the orthant
					// the point must stay in
					double ori = xp[f];
					if (ori == 0.0)
						ori = -pg[f];
					if (x[f] * ori <= 0.0)
						x[f] = 0.0;
				}
			}
			// And we ask for the value of the objective function
			// and its gradient.
			fx = grd_gradient(mdl, g, grds);
			// Now we check whether the step satisfies the
			// conditions. For l-bfgs, we check the classical
			// decrease and curvature conditions known as the Wolfe
			// conditions [2, pp 506]
			//   f(x_k + α_k d_k) ≤ f(x_k) + β' α_k g_k^T d_k
			//   g(x_k + α_k d_k)^T d_k ≥ β g_k^T d_k
			//
			// And for owl-qn we check a variant of the Armijo rule
			// described in [3, pp 36]
			//   f(π(x^k+αp^k;ξ)) ≤ f(x^k) - γv^T[π(x^k+αp^k;ξ)-x^k]
			if (!l1) {
				if (fx > fi + stp * gd * 1e-4)
					sc = 0.5;
				else if (xvm_dot(g, d, F) < gd * 0.9)
					sc = 2.1;
				else
					break;
			} else {
				double vp = 0.0;
				for (size_t f = 0; f < F; f++)
					vp += (x[f] - xp[f]) * d[f];
				if (fx < fi + vp * 1e-4)
					break;
			}
			// If we reach the maximum number of linesearch steps
			// without finding a good one, we just fail.
			if (ls == mdl->opt->lbfgs.maxls) {
				warning("maximum linesearch reached");
				err = true;
				break;
			}
		}
		// If the linesearch failed or the user interrupted training,
		// we return to the last valid point and stop the training. The
		// model is probably not fully optimized, but we let the user
		// decide what to do with it.
		if (err || uit_stop) {
			memcpy(x, xp, sizeof(double) * F);
			break;
		}
		if (uit_progress(mdl, k + 1, fx) == false)
			break;
		// 3rd step: we update the history used for approximating the
		// inverse of the hessian
		//   s_k = x_{k+1} - x_k
		//   y_k = g_{k+1} - g_k
		//   ρ_k = 1 / y_k^T s_k
		const int kn = (k + 1) % M;
		xvm_sub(s[kn], x, xp, F);
		xvm_sub(y[kn], g, gp, F);
		p[kn] = 1.0 / xvm_dot(y[kn], s[kn], F);
		// And last, we check for convergence. The convergence check is
		// quite simple [2, pp 508]
		//   ||g|| / max(1, ||x||) ≤ ε
		// with ε small enough that we stop only when numerical
		// precision is reached. For owl-qn we just have to check
		// against the pseudo-gradient instead of the true one.
		const double xn = xvm_norm(x, F);
		const double gn = xvm_norm(l1 ? pg : g, F);
		if (gn / max(xn, 1.0) <= 1e-5)
			break;
		if (k + 1 == K)
			break;
		// The second stopping criterion tested is the relative
		// improvement of the function value over the past C iterations.
		// When this falls below an epsilon, we also stop the
		// minimization.
		fh[k % C] = fx;
		double dlt = 1.0;
		if (k >= C) {
			const double of = fh[(k + 1) % C];
			dlt = fabs(of - fx) / of;
			if (dlt < mdl->opt->stopeps)
				break;
		}
	}
	// Cleanup: We free all the vectors we have allocated.
	xvm_free(xp); xvm_free(g);
	xvm_free(gp); xvm_free(d);
	for (int m = 0; m < M; m++) {
		xvm_free(s[m]);
		xvm_free(y[m]);
	}
	if (l1)
		xvm_free(pg);
	for (size_t w = 1; w < W; w++)
		xvm_free(grds[w]->g);
	for (size_t w = 0; w < W; w++)
		grd_free(grds[w]);
}
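The direction computation in the loop above is an instance of the L-BFGS two-loop recursion, folded onto a circular history buffer. As a companion, here is a self-contained sketch of the textbook form of that recursion from [1]: plain linear history, no circular indexing, no xvm_* helpers. The dimensions, history vectors and gradient are all made up for the demo; it illustrates the technique, not the exact code above.

/* Two-loop L-BFGS recursion (standalone sketch): textbook ordering, plain
 * linear history with the oldest pair first. The s/y pairs, ρ values and
 * gradient are all made up for the demo. */
#include <stdio.h>

#define F 2 /* problem dimension */
#define M 2 /* history size      */

static double dot(const double a[F], const double b[F]) {
	double r = 0.0;
	for (int f = 0; f < F; f++)
		r += a[f] * b[f];
	return r;
}

int main(void) {
	/* History, oldest first: s_k = x_{k+1} - x_k, y_k = g_{k+1} - g_k. */
	const double s[M][F] = {{0.5, 0.0}, {0.1, 0.2}};
	const double y[M][F] = {{1.0, 0.1}, {0.2, 0.5}};
	const double g[F]    = {0.8, -0.4};
	double rho[M], alpha[M], d[F];
	for (int i = 0; i < M; i++)
		rho[i] = 1.0 / dot(y[i], s[i]); /* ρ_k = 1 / y_k^T s_k */
	/* q = -g, then first loop from newest to oldest history pair. */
	for (int f = 0; f < F; f++)
		d[f] = -g[f];
	for (int i = M - 1; i >= 0; i--) {
		alpha[i] = rho[i] * dot(s[i], d);
		for (int f = 0; f < F; f++)
			d[f] -= alpha[i] * y[i][f];
	}
	/* r = H_0 q with the usual scaling H_0 = (y^T s / ||y||²) I. */
	const double scale = dot(y[M - 1], s[M - 1])
	                   / dot(y[M - 1], y[M - 1]);
	for (int f = 0; f < F; f++)
		d[f] *= scale;
	/* Second loop from oldest to newest history pair. */
	for (int i = 0; i < M; i++) {
		const double beta = rho[i] * dot(y[i], d);
		for (int f = 0; f < F; f++)
			d[f] += (alpha[i] - beta) * s[i][f];
	}
	printf("search direction: (%.4f, %.4f)\n", d[0], d[1]);
	return 0;
}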