/// Perform one iteration of the SAG algorithm with gain eta.
/// Argument i is the index of the loss in the saved dloss vector.
void SvmSag::trainOne(const SVector &x, double y, double eta, int i)
{
  // compute loss
  double s = dot(w,x) * wa + wBias;
  if (wb != 0)
    s += dot(g,x) * wb;
  // compute dloss
  double d = LOSS::dloss(s, y);
  double od = sd[i - sdimin];
  sd[i - sdimin] = d;
  d = d - od;
  // update weights
  g.add(x, d);
  w.add(x, - d * wb / wa);
  double decay = 1 - lambda * eta;
  wa = wa * decay;
  wb = wb * decay + eta / m;
  if (wa < 1e-5)
    renorm();
  // same for the bias
#if BIAS
  double etab = eta * 0.01;
  gBias += d;
#if REGULARIZED_BIAS
  wBias *= (1 - etab * lambda);
#endif
  wBias += etab * gBias / m;
#endif
}
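// One way to read the (wa, wb) bookkeeping above: writing v = wa * w + wb * g
// for the effective weight vector and g = sum_j d_j x_j for the running sum of
// the m stored loss derivatives, replacing sd[i] and adding (d_new - d_old) * x
// to g refreshes that sum, the compensating w.add(x, -d * wb / wa) leaves v
// unchanged, and the decay step
//   wa <- (1 - lambda*eta) * wa,   wb <- (1 - lambda*eta) * wb + eta / m
// realizes the SAG update  v <- (1 - lambda*eta) * v + (eta / m) * g
// without touching every coordinate of w.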
/// Perform one iteration of the ASGD algorithm with gain eta and averaging rate mu
void SvmAsgd::trainOne(const SVector &x, double y, double eta, double mu)
{
  // Renormalize if needed
  if (aDivisor > 1e5 || wDivisor > 1e5)
    renorm();
  // Forward
  double s = dot(w,x) / wDivisor + wBias;
  // SGD update for regularization term
  wDivisor = wDivisor / (1 - eta * lambda);
  // SGD update for loss term
  double d = LOSS::dloss(s, y);
  double etd = eta * d * wDivisor;
  if (etd != 0)
    w.add(x, etd);
  // Averaging
  if (mu >= 1)
    {
      a.clear();
      aDivisor = wDivisor;
      wFraction = 1;
    }
  else if (mu > 0)
    {
      if (etd != 0)
        a.add(x, - wFraction * etd);
      aDivisor = aDivisor / (1 - mu);
      wFraction = wFraction + mu * aDivisor / wDivisor;
    }
  // same for the bias
#if BIAS
  double etab = eta * 0.01;
#if REGULARIZED_BIAS
  wBias *= (1 - etab * lambda);
#endif
  wBias += etab * d;
  aBias += mu * (wBias - aBias);
#endif
}
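// Informally, the averaging branch above maintains the running average
//   abar <- (1 - mu) * abar + mu * w_next
// in a lazy representation (a, aDivisor, wFraction) so that each step only
// touches the nonzero coordinates of x; renorm() folds the scalars back into
// the vectors. When mu >= 1 the average is reset to the current iterate,
// which is how the caller typically disables averaging during the early
// iterations.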
/// Winnie: pre-training pass over single samples. Because the data has been
/// randomly shuffled, just the first m instances are used.
void SvmSgd::pre_trainOne(const SVector &x, double y, double eta)
{
  double s = dot(w,x) / wDivisor + wBias;
  // update for regularization term
  wDivisor = wDivisor / (1 - eta * lambda);
  if (wDivisor > 1e5)
    renorm();
  // update for loss term
  double d = LOSS::dloss(s, y);
  if (d != 0)
    w.add(x, eta * d * wDivisor);
  // same for the bias
#if BIAS
  double etab = eta * 0.01;
#if REGULARIZED_BIAS
  wBias *= (1 - etab * lambda);
#endif
  wBias += etab * d;
#endif
}
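// Here the logical weight vector is w / wDivisor. The ridge shrink
//   v <- (1 - eta*lambda) * v
// is applied by growing the divisor instead of scaling every coordinate, and
// the loss step v <- v + eta * d * x becomes w.add(x, eta * d * wDivisor), so
// the cost per example stays proportional to the nonzeros of x; renorm() folds
// the divisor back into w before it grows too large.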
void SvmSgd::train(int imin, int imax, const xvec_t &xp, const yvec_t &yp,
                   const char *prefix)
{
  cout << prefix << "Training on [" << imin << ", " << imax << "]." << endl;
  assert(imin <= imax);
  count = skip;
  for (int i = imin; i <= imax; i++)
    {
      const SVector &x = xp.at(i);
      double y = yp.at(i);
      double wx = dot(w,x);
      double z = y * (wx + bias);
      double eta = 1.0 / (lambda * t);
#if LOSS < LOGLOSS
      if (z < 1)
#endif
        {
          double etd = eta * dloss(z);
          w.add(x, etd * y);
#if BIAS
#if REGULARIZEBIAS
          bias *= 1 - eta * lambda * bscale;
#endif
          bias += etd * y * bscale;
#endif
        }
      if (--count <= 0)
        {
          double r = 1 - eta * lambda * skip;
          if (r < 0.8)
            r = pow(1 - eta * lambda, skip);
          w.scale(r);
          count = skip;
        }
      t += 1;
    }
  cout << prefix << setprecision(6)
       << "Norm: " << dot(w,w) << ", Bias: " << bias << endl;
}
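// The `skip' block above applies the ridge shrinkage lazily: rather than
// scaling w by (1 - eta*lambda) at every step, it scales once every `skip'
// steps using the first-order approximation
//   (1 - eta*lambda)^skip  ~=  1 - eta*lambda*skip,
// and falls back to the exact pow() form when that factor would drop below
// 0.8. This keeps the per-example cost proportional to the nonzeros of x
// rather than to the dimension of w.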
/// Perform one SGD iteration (used to determine eta)
void SvmSag::trainSgdOne(const SVector &x, double y, double eta, int i)
{
  assert(wb == 0);
  double s = dot(w,x) * wa + wBias;
  wa = wa * (1 - eta * lambda);
  if (wa < 1e-5)
    renorm();
  double d = LOSS::dloss(s, y);
  if (i >= 0)
    sd[i - sdimin] = d;
  if (d != 0)
    w.add(x, eta * d / wa);
#if BIAS
  double etab = eta * 0.01;
#if REGULARIZED_BIAS
  wBias *= (1 - etab * lambda);
#endif
  wBias += etab * d;
#endif
}
void SvmSgd::train(int imin, int imax, const xvec_t &xp, const yvec_t &yp,
                   const char *prefix)
{
  cout << prefix << "Training on [" << imin << ", " << imax << "]." << endl;
  assert(imin <= imax);
  for (int i = imin; i <= imax; i++)
    {
      double eta = 1.0 / (lambda * t);
      double s = 1 - eta * lambda;
      wscale *= s;
      if (wscale < 1e-9)
        {
          w.scale(wscale);
          wscale = 1;
        }
      const SVector &x = xp.at(i);
      double y = yp.at(i);
      double wx = dot(w,x) * wscale;
      double z = y * (wx + bias);
#if LOSS < LOGLOSS
      if (z < 1)
#endif
        {
          double etd = eta * dloss(z);
          w.add(x, etd * y / wscale);
#if BIAS
          // Slower rate on the bias because
          // it learns at each iteration.
          bias += etd * y * 0.01;
#endif
        }
      t += 1;
    }
  double wnorm = dot(w,w) * wscale * wscale;
  cout << prefix << setprecision(6)
       << "Norm: " << wnorm << ", Bias: " << bias << endl;
}
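// In this variant the logical weights are wscale * w. With the schedule
// eta = 1 / (lambda * t) the shrink factor is 1 - eta*lambda = (t - 1) / t,
// so wscale telescopes to roughly t0 / t between resets and eventually
// underflows; the wscale < 1e-9 branch folds it back into w. The loss update
// divides by wscale (w.add(x, etd * y / wscale)) to operate in logical space
// while touching only the nonzeros of x.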
/// Perform one iteration of the SGD algorithm with specified gain.
/// This is the only function differentiating the averaged implicit from the
/// averaged (explicit) implementation. We simply merge the implementation of
/// the implicit update with averaging.
void SvmAisgd::trainOne(const SVector &x, double y, double eta, double mu)
{
  double etd = 0;
  // HingeLoss case.
  if (LOSS::name().compare("HingeLoss") == 0)
    {
      double ypred = dot(x, w) / wDivisor;
      double implicitFactor = (1 + lambda * eta);
      if (1 - y * ypred / implicitFactor < 0)
        {
          // Update will be theta_{t+1} = theta_t / (1 + lambda * eta)
          wDivisor *= implicitFactor;
        }
      else
        {
          // Compute x_t' theta_{t+1} for the candidate update.
          double ypred = 0;
          for (const SVector::Pair *p = x; p->i >= 0; p++)
            {
              double w_i = w.get(p->i) / wDivisor;
              ypred += p->v * (w_i + p->v * eta * y);
            }
          if (1 - y * ypred / implicitFactor >= 0)
            {
              etd = eta * y * wDivisor;
              w.add(x, etd);
              // Update is theta_{t+1} = (1 / (1 + lambda * eta)) * (theta_t + eta * y_t * x_t)
              wDivisor *= implicitFactor;
            }
          else
            {
              // do nothing (no update of the parameters).
            }
        }
      if (wDivisor > 1e5)
        renorm();
    }
  else if (LOSS::name().compare("LogLoss") == 0)
    {
      // Need to solve ξ_t = a_t (y_t - h(theta_t' x_t + ξ_t ||x_t||^2)).
      // Solve approximately by using
      //   ξ_t = (1 / (1 + a_t ||x_t||^2 h'(theta_t' x_t))) * a_t * (y_t - h(theta_t' x_t))
      // TODO(ptoulis): Use implicit Algorithm 1 of Toulis et al. (ICML 2014).
      double wx = dot(w, x) / wDivisor;
      double ypred = 2 * (exp(wx) / (1 + exp(wx))) - 1;
      double implicitFactor = 1 + eta * dot(x, x) * ypred / (1 + exp(wx));
      double ksi_t = (1 / implicitFactor) * eta * (y - ypred);
      etd = wDivisor * ksi_t;
      w.add(x, etd);
    }
  else
    {
      cout << "#" << LOSS::name() << "# -- loss not found." << endl;
    }
  // Averaging
  if (mu >= 1)
    {
      a.clear();
      aDivisor = wDivisor;
      wFraction = 1;
    }
  else if (mu > 0)
    {
      if (etd != 0)
        a.add(x, - wFraction * etd);
      aDivisor = aDivisor / (1 - mu);
      wFraction = wFraction + mu * aDivisor / wDivisor;
    }
}
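// For the log-loss branch, the implicit (proximal) step solves the scalar
// fixed-point equation
//   ξ_t = eta * (y_t - h(theta_t' x_t + ξ_t * ||x_t||^2)),
// with theta_{t+1} = theta_t + ξ_t * x_t. Linearizing h around theta_t' x_t
// gives the closed-form approximation quoted in the comment,
//   ξ_t ~= eta * (y_t - h(theta_t' x_t)) / (1 + eta * ||x_t||^2 * h'(theta_t' x_t)),
// so the code performs an approximate rather than exact implicit update (see
// the TODO). For the hinge loss the implicit step reduces to the case analysis
// above: shrink only, shrink plus a gradient step, or no update at all.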