/**
 * Compute loss and gradient of Huber hinge loss.
 *
 * CAUTION: f is passed by reference and is changed within this
 * function. This is done for efficiency reasons; otherwise we would
 * have had to create a new copy of f.
 *
 * @param loss [write] loss value computed.
 * @param f [read/write] prediction vector.
 * @param l [write] partial derivative of loss function w.r.t. f
 */
void CHuberHingeLoss::LossAndGrad(double& loss, TheMatrix& f, TheMatrix& l)
{
   f.ElementWiseMult(_data->labels());   // f now holds y*f
   double* yf = f.Data();
   double* Y = _data->labels().Data();
   int len = f.Length();
   loss = 0.0;
   l.Zero();

   for(int i = 0; i < len; i++)
   {
      double v = 1 - yf[i];
      if(h < v)                          // linear regime
      {
         loss += v;
         l.Set(i, -Y[i]);
      }
      else if(-h > v)
      {
         // flat regime: zero loss, zero gradient
      }
      else                               // quadratic bridge on [-h, h]
      {
         loss += (v+h)*(v+h)/4/h;
         l.Set(i, -Y[i]*(v+h)/2/h);
      }
   }
}
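/*
 * A minimal standalone sketch (not part of the library) of the pointwise
 * Huber hinge loss computed above. With v = 1 - y*f and smoothing
 * parameter h > 0: loss = v when v > h, 0 when v < -h, and the quadratic
 * bridge (v+h)^2/(4h) in between; the derivative w.r.t. f follows by the
 * chain rule and matches the l.Set(...) calls in LossAndGrad.
 */
#include <utility>

std::pair<double,double> huberHingePoint(double y, double f, double h)
{
   const double v = 1.0 - y*f;
   if(v > h)
      return std::make_pair(v, -y);                 // linear regime
   if(v < -h)
      return std::make_pair(0.0, 0.0);              // flat regime
   return std::make_pair((v+h)*(v+h)/(4.0*h),       // quadratic bridge
                         -y*(v+h)/(2.0*h));
}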
/**
 * Compute loss and partial derivative of logistic loss w.r.t. f
 *
 * @param loss [write] loss value computed.
 * @param f [r/w] = X*w
 * @param l [write] partial derivative of loss w.r.t. f
 */
void CLogisticLoss::LossAndGrad(double& loss, TheMatrix& f, TheMatrix& l)
{
   loss = 0.0;
   l.Zero();   // for gradient computation i.e. grad := l'*X
   f.ElementWiseMult(_data->labels());
   double* f_array = f.Data();   // pointer to memory location of f (faster element access)
   int len = f.Length();
   double exp_yf = 0.0;

   for(int i = 0; i < len; i++)
   {
      if(f_array[i] == 0.0)
      {
         // y*f = 0: loss is log(2), derivative is -1/2
         loss += LN2;
         l.Set(i, -0.5);
      }
      else if(f_array[i] > 0.0)
      {
         // keep the exponent negative to avoid overflow
         exp_yf = exp(-f_array[i]);
         loss += log(1+exp_yf);
         l.Set(i, -exp_yf/(1+exp_yf));
      }
      else
      {
         // use log(1+exp(-m)) = log(1+exp(m)) - m for m <= 0
         exp_yf = exp(f_array[i]);
         loss += log(1+exp_yf) - f_array[i];
         l.Set(i, -1.0/(1+exp_yf));
      }
   }
   l.ElementWiseMult(_data->labels());
}
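/*
 * Standalone sketch of the overflow-safe evaluation used above, written
 * in terms of the margin m = y*f (log1p is assumed available from
 * <cmath>; the library code uses log(1+x) directly). For m > 0 the
 * exponent is negative already; for m <= 0 the identity
 * log(1+exp(-m)) = log(1+exp(m)) - m keeps it negative, so exp() never
 * overflows.
 */
#include <cmath>

double logisticLossPoint(double m)   // m = y * f
{
   if(m > 0.0)
      return log1p(exp(-m));
   return log1p(exp(m)) - m;
}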
/** The subgradient is chosen as sgn(w) */
void CL1N1::ComputeRegAndGradient(CModel& model, double& reg, TheMatrix& grad)
{
   reg = 0;
   TheMatrix &w = model.GetW();
   w.Norm1(reg);
   grad.Zero();

   for(int i = 0; i < w.Length(); i++)
   {
      double val = 0;
      w.Get(i, val);
      grad.Set(i, SML::sgn(val));
   }
}
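/*
 * Sketch of the subgradient choice, assuming SML::sgn returns -1, 0 or
 * +1. Any value in [-1,1] is a valid subgradient of |w_i| at w_i = 0;
 * choosing 0 keeps the subgradient sparse.
 */
#include <cmath>
#include <vector>

double l1NormAndSubgradient(const std::vector<double>& w, std::vector<double>& grad)
{
   double reg = 0.0;
   grad.assign(w.size(), 0.0);
   for(size_t i = 0; i < w.size(); i++)
   {
      reg += fabs(w[i]);
      grad[i] = (w[i] > 0.0) - (w[i] < 0.0);   // sgn(w_i)
   }
   return reg;
}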
/**
 * Compute loss and gradient of Least Absolute Deviation loss w.r.t. f
 *
 * @param loss [write] loss value computed.
 * @param f [r/w] = X*w
 * @param l [write] partial derivative of loss w.r.t. f
 */
void CLeastAbsDevLoss::LossAndGrad(double& loss, TheMatrix& f, TheMatrix& l)
{
   loss = 0;
   l.Zero();
   double* Y_array = _data->labels().Data();
   double* f_array = f.Data();
   int len = f.Length();

   for(int i = 0; i < len; i++)
   {
      double f_minus_y = f_array[i] - Y_array[i];
      loss += fabs(f_minus_y);
      l.Set(i, SML::sgn(f_minus_y));
   }
}
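/*
 * Pointwise sketch of the loss above: |f - y| with subgradient
 * sgn(f - y) w.r.t. f (0 is a valid choice at the kink f = y).
 */
#include <cmath>

double ladPoint(double f, double y, double& dldf)
{
   const double r = f - y;
   dldf = (r > 0.0) - (r < 0.0);   // sgn(r)
   return fabs(r);
}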
/**
 * Compute loss and gradient of novelty detection loss.
 *
 * Note: unlike some other losses here, f is only read, so no copy is
 * needed.
 *
 * @param loss [write] loss value computed.
 * @param f [read] prediction vector.
 * @param l [write] partial derivative of loss function w.r.t. f
 */
void CNoveltyLoss::LossAndGrad(double& loss, TheMatrix& f, TheMatrix& l)
{
   double* f_array = f.Data();   // pointer to memory location of f (faster element access)
   int len = f.Length();
   loss = 0.0;
   l.Zero();   // grad := l'*X

   for(int i = 0; i < len; i++)
   {
      if(rho > f_array[i])
      {
         loss += rho - f_array[i];
         l.Set(i, -1.0);
      }
   }
}
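/*
 * Pointwise sketch of the novelty detection loss above, assuming a
 * margin parameter rho > 0: loss = max(0, rho - f), with subgradient -1
 * w.r.t. f when f < rho and 0 otherwise (0 is also valid at the kink).
 */
double noveltyPoint(double f, double rho, double& dldf)
{
   if(f < rho)
   {
      dldf = -1.0;
      return rho - f;
   }
   dldf = 0.0;
   return 0.0;
}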
/**
 * Compute loss and partial derivative of NDCGRank loss w.r.t. f
 *
 * @param loss [write] loss value computed.
 * @param f [r/w] = X*w
 * @param l [write] partial derivative of loss w.r.t. f
 */
void CNDCGRankLoss::LossAndGrad(Scalar& loss, TheMatrix& f, TheMatrix& l)
{
   // chteo: here we make use of the subset information
   loss = 0.0;
   l.Zero();
   Scalar* f_array = f.Data();

   for(int q = 0; q < _data->NumOfSubset(); q++)
   {
      int offset = _data->subset[q].startIndex;
      int subsetsize = _data->subset[q].size;
      current_ideal_pi = sort_vectors[q];
      vector<double> b = bs[q];

      // find the best (loss-augmented) permutation
      find_permutation(subsetsize, offset, a, b, c, f_array, pi);

      // compute the loss
      double value;
      delta(subsetsize, a, b, pi, value);
      loss += value;
      for(int i = 0; i < subsetsize; i++)
         loss += c[i]*(get(f_array, offset, pi[i]) - get(f_array, offset, i));

      // accumulate the gradient w.r.t. f
      for(int i = 0; i < subsetsize; i++)
      {
         add(l, offset, i, -c[i]);
         add(l, offset, pi[i], c[i]);
      }
   }
}
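/*
 * Hedged reference sketch (helper names are illustrative, not library
 * API) of the ranking measure this loss bounds. A common convention is
 * DCG = sum_i (2^{rel_i} - 1) / log2(i + 2) over the ranked order,
 * normalised by the DCG of the ideal ordering to give NDCG in [0,1];
 * the exact gain/discount used by CNDCGRankLoss may differ.
 */
#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

double dcg(const std::vector<double>& rel)    // relevances in ranked order
{
   double s = 0.0;
   for(size_t i = 0; i < rel.size(); i++)
      s += (pow(2.0, rel[i]) - 1.0) / (log(i + 2.0)/log(2.0));
   return s;
}

double ndcg(std::vector<double> rel)          // relevances in predicted order
{
   const double pred = dcg(rel);
   std::sort(rel.begin(), rel.end(), std::greater<double>());
   const double ideal = dcg(rel);               // ideal = best possible DCG
   return ideal > 0.0 ? pred/ideal : 0.0;
}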
void CGenericLoss::ComputeLossAndGradient(double& loss, TheMatrix& grad)
{
   loss = 0;
   grad.Zero();
   TheMatrix &w = _model->GetW();
   double* dat = w.Data();
   double* raw_g = grad.Data();

   {
      double* resy;
      double* resybar;
      map<int,int> ybar;

      resy = new double [data->dim()];
      resybar = new double [data->dim()];

      // loss-augmented inference: find the most violating labelling ybar
      minimize(data->nodeFeatures, &(data->nodeLabels), data->edgeFeatures,
               dat, dat + data->nNodeFeatures, ybar,
               data->nNodeFeatures, data->nEdgeFeatures,
               data->lossPositive, data->lossNegative,
               data->indexEdge, NULL, 1, data->firstOrderResponses);

      // joint feature maps for the true labelling y and for ybar
      Phi(data->nodeFeatures, &(data->nodeLabels), data->edgeFeatures,
          data->nNodeFeatures, data->nEdgeFeatures,
          resy, resy + data->nNodeFeatures, data->indexEdge);
      Phi(data->nodeFeatures, &ybar, data->edgeFeatures,
          data->nNodeFeatures, data->nEdgeFeatures,
          resybar, resybar + data->nNodeFeatures, data->indexEdge);

      // label loss Delta(y, ybar) plus margin term <w, Phi(ybar) - Phi(y)>
      loss += LabelLoss(data->nodeLabels, ybar, data->lossPositive, data->lossNegative, LOSS);
      for(int j = 0; j < (int) data->dim(); j++)
      {
         loss += dat[j]*(resybar[j]-resy[j]);
         raw_g[j] += (1.0/data->N)*(resybar[j]-resy[j]);
      }

      delete [] resy;
      delete [] resybar;
   }
   loss = loss/data->N;
}
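/*
 * Schematic sketch of the margin-rescaling computation above (Phi,
 * minimize and LabelLoss are the library's helpers; the flat vectors
 * here are illustrative). For one example, with ybar* the
 * loss-augmented argmax:
 *    loss = Delta(y, ybar*) + <w, Phi(ybar*) - Phi(y)>
 *    grad = Phi(ybar*) - Phi(y)
 * and both are averaged over the N training examples.
 */
#include <vector>

double structuredHinge(const std::vector<double>& w,
                       const std::vector<double>& phiY,      // Phi(x, y)
                       const std::vector<double>& phiYbar,   // Phi(x, ybar*)
                       double delta,                         // Delta(y, ybar*)
                       std::vector<double>& grad)
{
   double loss = delta;
   grad.assign(w.size(), 0.0);
   for(size_t j = 0; j < w.size(); j++)
   {
      const double d = phiYbar[j] - phiY[j];
      loss += w[j]*d;
      grad[j] = d;
   }
   return loss;
}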
/** Compute loss and gradient */
void CSMMMulticlassLoss::ComputeLossAndGradient(double& loss, TheMatrix& grad)
{
   iterNum++;
   TheMatrix &w = _model->GetW();
   loss = 0;
   grad.Zero();
   TheMatrix g(grad, SML::DENSE);
   const vector<CSeqMulticlassLabel::seqlabel_struct> &Y = _data->labels();
   const vector<CSeqMulticlassFeature::seqfeature_struct> &X = _data->features();
   unsigned int trainExNum = 0;
   vector<int> cvmark = _data->Getcvmark();

   for(unsigned int i = 0; i < m; i++)
   {
      // skip examples not marked as training data
      if(cvmark.size() != 0 && cvmark[i] != SMM::TRAIN_DATA)
         continue;
      trainExNum++;

      vector<unsigned int> ybar(X[i].len, 0);
      vector<unsigned int> ybarlabel(X[i].len, 0);
      double labelloss = 0;
      double marginloss = 0;
      double w_dot_g = 0.0;

      // find the best labelling ybar and its score
      if(verbosity >= 2)
      {
         cout << "ex:" << i << endl;
         fflush(stdout);
      }
      if(is_single_action_persequence)
         find_best_label_grammer(Y[i].pos, Y[i].type, X[i], w, ybar, ybarlabel,
                                 marginloss, labelloss, 0, _data->getNumOfClass());
      else
         find_best_label(Y[i].pos, Y[i].type, X[i], w, ybar, ybarlabel,
                         marginloss, labelloss, 0, _data->getNumOfClass());

      double labelloss_y = 0;
      double marginloss_y = 0;
      double labelloss_ybar = 0;
      double marginloss_ybar = 0;
      ComputeLoss(Y[i].pos, Y[i].type, ybar, ybarlabel, X[i], w,
                  marginloss_ybar, labelloss_ybar, 1);
      if(lossw[0] != 0)
         labelloss += lossw[0];
      if(lastDuration > 0)
      {
         marginloss = marginloss_ybar;
         labelloss = labelloss_ybar;
      }

      if(verbosity >= 3)
      {
         ComputeLoss(Y[i].pos, Y[i].type, Y[i].pos, Y[i].type, X[i], w,
                     marginloss_y, labelloss_y, 1);
         printf("dp------marginloss:%2.4f---labelloss:%2.4f------\n", marginloss, labelloss);
         printf("ybar----marginloss:%2.4f---labelloss:%2.4f------\n", marginloss_ybar, labelloss_ybar);
         printf("y-------marginloss:%2.4f---labelloss:%2.4f------\n", marginloss_y, labelloss_y);
         if(abs(labelloss_ybar - labelloss) > 1e-5)
            printf("labelloss doesn't match!\n");
         if(abs(marginloss_ybar - marginloss) > 1e-5)
         {
            printf("marginloss_ybar_dp:%2.4f != marginloss_ybar_computeLoss:%2.4f\n",
                   marginloss, marginloss_ybar);
            printf("marginloss doesn't match!\n");
         }
      }

      // construct the feature vector g for the true labelling y
      const vector<unsigned int> &y = Y[i].pos;
      const vector<unsigned int> &ylabel = Y[i].type;
      g.Zero();
      for(unsigned int j = 0; j < y.size(); j++)
      {
         _data->TensorPhi1(X[i].phi_1[y[j]], ylabel[j], 0, tphi_1);
         g.Add(*tphi_1);
         if(j > 0)
         {
            _data->TensorPhi2(X[i].phi_2[y[j-1]][y[j]-y[j-1]-1],
                              ylabel[j-1], ylabel[j], 0, 0, tphi_2);
            g.Add(*tphi_2);
         }
      }
      if(y.size() > 0)
      {
         _data->TensorPhi2(X[i].phi_2[y[y.size()-1]][X[i].len - y[y.size()-1]-1],
                           ylabel[y.size()-1], 0, 0, 0, tphi_2);
         g.Add(*tphi_2);
      }

      // accumulate the feature vector for the predicted labelling ybar
      for(unsigned int j = 0; j < ybar.size(); j++)
      {
         _data->TensorPhi1(X[i].phi_1[ybar[j]], ybarlabel[j], 0, tphi_1);
         grad.Add(*tphi_1);
         if(j > 0)
         {
            _data->TensorPhi2(X[i].phi_2[ybar[j-1]][ybar[j]-ybar[j-1]-1],
                              ybarlabel[j-1], ybarlabel[j], 0, 0, tphi_2);
            grad.Add(*tphi_2);
         }
      }
      if(ybar.size() > 0)
      {
         _data->TensorPhi2(X[i].phi_2[ybar[ybar.size()-1]][X[i].len - ybar[ybar.size()-1]-1],
                           ybarlabel[ybar.size()-1], 0, 0, 0, tphi_2);
         grad.Add(*tphi_2);
      }
      grad.Minus(g);

      // accumulate the loss
      w.Dot(g, w_dot_g);
      loss = loss - w_dot_g + marginloss + labelloss;
   }

   // average loss and gradient over the training examples
   scalingFactor = 1.0/trainExNum;
   grad.Scale(scalingFactor);
   loss *= scalingFactor;

   if(verbosity)
   {
      double gnorm = 0.0;
      grad.Norm2(gnorm);
      cout << "gradient norm=" << gnorm << endl;
   }
}
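/*
 * Hedged sketch (a hypothetical helper, not the library's
 * find_best_label) of the kind of dynamic program such decoders run:
 * loss-augmented Viterbi over a label chain, where node[t][y] is the
 * augmented score <w, phi_1(t, y)> + Delta_t(y) and trans[y'][y] the
 * transition score. find_best_label solves a similar argmax over
 * segment boundaries and classes.
 */
#include <vector>

double viterbiArgmax(const std::vector<std::vector<double> >& node,
                     const std::vector<std::vector<double> >& trans,
                     std::vector<unsigned int>& path)
{
   const size_t T = node.size(), K = node[0].size();
   std::vector<std::vector<double> > score(T, std::vector<double>(K));
   std::vector<std::vector<int> > back(T, std::vector<int>(K, -1));
   score[0] = node[0];

   for(size_t t = 1; t < T; t++)
      for(size_t y = 0; y < K; y++)
      {
         double best = score[t-1][0] + trans[0][y];
         int arg = 0;
         for(size_t yp = 1; yp < K; yp++)
         {
            const double s = score[t-1][yp] + trans[yp][y];
            if(s > best) { best = s; arg = (int)yp; }
         }
         score[t][y] = best + node[t][y];
         back[t][y] = arg;
      }

   // backtrack from the best final state
   size_t ybest = 0;
   for(size_t y = 1; y < K; y++)
      if(score[T-1][y] > score[T-1][ybest]) ybest = y;
   path.assign(T, 0);
   path[T-1] = (unsigned int)ybest;
   for(size_t t = T - 1; t > 0; t--)
      path[t-1] = (unsigned int)back[t][path[t]];
   return score[T-1][ybest];
}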