/// Perform one iteration of the SGD algorithm with specified gain void SvmAsgd::trainOne(const SVector &x, double y, double eta, double mu) { // Renormalize if needed if (aDivisor > 1e5 || wDivisor > 1e5) renorm(); // Forward double s = dot(w,x) / wDivisor + wBias; // SGD update for regularization term wDivisor = wDivisor / (1 - eta * lambda); // SGD update for loss term double d = LOSS::dloss(s, y); double etd = eta * d * wDivisor; if (etd != 0) w.add(x, etd); // Averaging if (mu >= 1) { a.clear(); aDivisor = wDivisor; wFraction = 1; } else if (mu > 0) { if (etd != 0) a.add(x, - wFraction * etd); aDivisor = aDivisor / (1 - mu); wFraction = wFraction + mu * aDivisor / wDivisor; } // same for the bias #if BIAS double etab = eta * 0.01; #if REGULARIZED_BIAS wBias *= (1 - etab * lambda); #endif wBias += etab * d; aBias += mu * (wBias - aBias); #endif }
/// Perform one iteration of the SGD algorithm with specified gain /// This is the only function differentiating the averaged implicit from the /// averaged (explicit) implementation. We simply merge the implementations for /// the implicit update with averaging. void SvmAisgd::trainOne(const SVector &x, double y, double eta, double mu) { double etd = 0; // HingeLoss case. if(LOSS::name().compare("HingeLoss")==0) { double ypred = dot(x, w) / wDivisor; double implicitFactor = (1 + lambda * eta); if(1 - y * ypred / implicitFactor < 0) { wDivisor *= implicitFactor; // Update will be W_n+1 = Wn / (1+lambda * eta) } else { double ypred = 0; // computes x_t' theta_{t+1} (next update) for(const SVector::Pair *p = x; p->i >= 0; p++) { double w_i = w.get(p->i) / wDivisor; ypred += p->v * (w_i + p->v * eta * y); } if(1 - y * ypred / implicitFactor >= 0) { etd = eta * y * wDivisor; w.add(x, etd); wDivisor *= implicitFactor; // Update should be theta_{t+!1} = (1/(1+lambda eta)) * (theta_t + eta * yt * xt) } else { // do nothing (no update in parameters). } } if (wDivisor > 1e5) renorm(); } else if(LOSS::name().compare("LogLoss")==0) { // Need to solve ξ_t = at (yt - h(theta_t' xt + ξt ||xt||^2)) // Solve approximately by using // ξt = (1 / (1 + at ||xt||^2 h'(theta_t'xt)) * at * (yt - h(theta_t' xt)) // TODO(ptoulis): Use implicit Algorithm 1 of (Toulis, et.al., ICML14) double wx = dot(w, x) / wDivisor; double ypred = 2 * (exp(wx) / (1 + exp(wx))) - 1; double implicitFactor = 1 + eta * dot(x, x) * ypred / (1 + exp(wx)); double ksi_t = (1 / implicitFactor) * eta * (y - ypred); etd = wDivisor * ksi_t; w.add(x, etd); } else { cout << "#" << LOSS::name() << "# -- loss not found."; } // Averaging if (mu >= 1) { a.clear(); aDivisor = wDivisor; wFraction = 1; } else if (mu > 0) { if (etd != 0) a.add(x, - wFraction * etd); aDivisor = aDivisor / (1 - mu); wFraction = wFraction + mu * aDivisor / wDivisor; } }