double GradientDD::computeGradient(dVector& vecGradient, Model* m, DataSequence*)
{
	dVector tmpVec;
	vecGradient = *(m->getWeights());
	vecGradient.add(mu);
	tmpVec = vecGradient;
	tmpVec.transpose();
	tmpVec.multiply(vecGradient);
	// f = exp( -0.5 * (w + mu)' * (w + mu) )
	double f = exp(-0.5*tmpVec[0]);
	vecGradient.multiply(f);
	return f;
}
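// Sketch of the math above (my reading of the code, not part of the original source):
// with w the current weights and mu the stored offset vector,
//   f(w)              = exp( -0.5 * (w + mu)' * (w + mu) )
//   returned gradient = f(w) * (w + mu)
// which is the negative of df/dw; as with the other gradient classes, the final
// sign appears to be handled by the caller (see the negate() in Gradient::computeGradient).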
void Toolbox::calculateGlobalMean(DataSet& X, dVector& mean)
{
	dVector seqSum;
	int nbElements = 0;
	// Calculate mean
	for(int i = 0; i < (int)X.size(); i++)
	{
		X.at(i)->getPrecomputedFeatures()->rowSum(seqSum);
		mean.add(seqSum);
		nbElements += X.at(i)->getPrecomputedFeatures()->getWidth();
	}
	mean.multiply(1.0/(double)nbElements);
}
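// What the loop above computes (illustrative note, assuming each precomputed
// feature matrix stores one column per frame):
//   mean[d] = ( sum_i sum_t X_i(d, t) ) / ( sum_i T_i )
// where X_i is sequence i's precomputed feature matrix and T_i its number of
// frames (getWidth()); rowSum() supplies the per-dimension sums for one sequence.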
double GradientHCRF::computeGradient(dVector& vecGradient, Model* m, DataSequence* X)
{
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	int NumSeqLabels = m->getNumberOfSequenceLabels();

	// Get adjacency matrix
	uMatrix adjMat;
	m->getAdjacencyMatrix(adjMat, X);

	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);

	dVector Partition;
	Partition.resize(1,NumSeqLabels);
	std::vector<Beliefs> ConditionalBeliefs(NumSeqLabels);

	// Step 1: Run inference in each network to compute marginals conditioned on Y
	for(int i=0; i<NumSeqLabels; i++)
	{
		pInfEngine->computeBeliefs(ConditionalBeliefs[i], pFeatureGen, X, m, true, i);
		Partition[i] = ConditionalBeliefs[i].partition;
	}
	double f_value = Partition.logSumExp() - Partition[X->getSequenceLabel()];

	// Step 2: Compute expected values for node features conditioned on Y
#if !defined(_VEC_FEATURES) && !defined(_OPENMP)
	featureVector* vecFeatures;
#endif
#if defined(_OPENMP)
	int ThreadID = omp_get_thread_num();
	if (ThreadID >= nbThreadsMP)
		ThreadID = 0;
#else
	int ThreadID = 0;
#endif
	double value;
	dMatrix CEValues;
	CEValues.resize(nbFeatures,NumSeqLabels);

	// Loop over nodes to compute features and update the gradient
	for(int j=0; j<NumSeqLabels; j++) { // For every sequence label
		for(int i = 0; i < X->length(); i++) { // For every node
#if defined(_VEC_FEATURES) || defined(_OPENMP)
			pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X, m, i, -1, j);
			// Loop over features
			feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
			for(int k = 0; k < vecFeaturesMP[ThreadID].size(); k++, pFeature++)
#else
			vecFeatures = pFeatureGen->getFeatures(X, m, i, -1, j);
			// Loop over features
			feature* pFeature = vecFeatures->getPtr();
			for(int k = 0; k < vecFeatures->size(); k++, pFeature++)
#endif
			{
				// p(s_i=s|x,Y) * f_k(i,s,x,y)
				value = ConditionalBeliefs[j].belStates[i][pFeature->nodeState] * pFeature->value;
				CEValues.setValue(j, pFeature->globalId, CEValues(j,pFeature->globalId) + value); // one row for each Y
			} // end for every feature
		} // end for every node
	} // end for every sequence label

	// Step 3: Compute expected values for edge features conditioned on Y
	// Loop over edges to compute features and update the gradient
	for(int j=0; j<NumSeqLabels; j++) {
		int edgeIndex = 0;
		for(int row = 0; row < X->length(); row++) { // Loop over all rows (the previous node index)
			for(int col = row; col < X->length(); col++) { // Loop over all columns (the current node index)
				if(adjMat(row,col) == 1) {
					// Get edge features
#if defined(_VEC_FEATURES) || defined(_OPENMP)
					pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X, m, col, row, j);
					// Loop over features
					feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
					for(int k = 0; k < vecFeaturesMP[ThreadID].size(); k++, pFeature++)
#else
					vecFeatures = pFeatureGen->getFeatures(X, m, col, row, j);
					// Loop over features
					feature* pFeature = vecFeatures->getPtr();
					for(int k = 0; k < vecFeatures->size(); k++, pFeature++)
#endif
					{
						// p(y_i=s1,y_j=s2|x,Y) * f_k(i,j,s1,s2,x,y)
						value = ConditionalBeliefs[j].belEdges[edgeIndex](pFeature->prevNodeState,pFeature->nodeState) * pFeature->value;
						CEValues.setValue(j, pFeature->globalId, CEValues(j,pFeature->globalId) + value);
					}
					edgeIndex++;
				}
			}
		}
	}

	// Step 4: Compute joint expected values
	dVector JointEValues;
	JointEValues.resize(1,nbFeatures);
	JointEValues.set(0);
	dVector rowJ;
	rowJ.resize(1,nbFeatures);
	double sumZLog = Partition.logSumExp();
	for (int j=0; j<NumSeqLabels; j++)
	{
		CEValues.getRow(j, rowJ);
		rowJ.multiply(exp(Partition.getValue(j)-sumZLog));
		JointEValues.add(rowJ);
	}

	// Step 5: Compute the gradient as Exi[i,*,*] - Exi[*,*,*], that is the difference
	// between expected values conditioned on the sequence label and joint expected values
	CEValues.getRow(X->getSequenceLabel(), rowJ); // rowJ = expected value conditioned on sequence label Y
	// [Negation moved to Gradient::ComputeGradient by LP]
	// rowJ.negate();
	JointEValues.negate();
	rowJ.add(JointEValues);
	vecGradient.add(rowJ);
	return f_value;
}
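// Worked equation for the steps above (a sketch of the standard HCRF gradient,
// written in the notation of the code; not part of the original source). With
// y* = X->getSequenceLabel() and Partition[y] = log Z_y(x):
//   p(y|x)            = Z_y(x) / sum_y' Z_y'(x)
//   f_value           = log sum_y' Z_y'(x) - log Z_y*(x) = -log p(y*|x)
//   d log Z_y / d w_k = E[ f_k | x, y ]                    (row y of CEValues)
//   d f_value / d w_k = E[ f_k | x ] - E[ f_k | x, y* ]    (JointEValues - rowJ)
// The code accumulates rowJ - JointEValues, i.e. the negated derivative; the
// final negate() in Gradient::computeGradient restores the sign for -log p.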
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X)
{
	double ans = 0.0;
#ifdef _OPENMP
	if( nbThreadsMP < 1 )
		nbThreadsMP = omp_get_max_threads();
	setMaxNumberThreads(nbThreadsMP);
	pInfEngine->setMaxNumberThreads(nbThreadsMP);
	pFeatureGen->setMaxNumberThreads(nbThreadsMP);
#endif

	// Check the size of vecGradient
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);
	else
		vecGradient.set(0);

	////////////////////////////////////////////////////////////
	// Start of parallel region
	// Some weird stuff in gcc 4.1 with OpenMP 2.5 support.
	//
	// Note 1: In OpenMP 2.5, the iteration variable in "for" must be
	// a signed integer variable type. In OpenMP 3.0 (_OPENMP>=200805),
	// it may also be an unsigned integer variable type, a pointer type,
	// or a constant-time random access iterator type.
	//
	// Note 2: schedule(static | dynamic): In the dynamic schedule, there
	// is no predictable order in which the loop items are assigned to
	// different threads. Each thread asks the OpenMP runtime library for
	// an iteration number, then handles it, then asks for the next one.
	// It is thus useful when different iterations in the loop may take
	// different amounts of time to execute.
#pragma omp parallel default(none) \
	shared(vecGradient, X, m, ans, nbFeatures, std::cout)
	{
		// Code inside this region runs in parallel
		dVector g(nbFeatures, COLVECTOR, 0.0);

#pragma omp for schedule(dynamic) reduction(+:ans)
		for(int i=0; i<(int)X->size(); i++) {
			DataSequence* x = X->at(i);
			if( m->isWeightSequence() && x->getWeightSequence() != 1.0) {
				dVector tmp(nbFeatures, COLVECTOR, 0.0);
				ans += computeGradient(tmp, m, x) * x->getWeightSequence();
				tmp.multiply(x->getWeightSequence());
				g.add(tmp);
			}
			else {
				ans += computeGradient(g, m, x);
			}
		}

		// We now put together the per-thread gradients.
		// No two threads can execute a critical directive of the same name at the same time.
#pragma omp critical (reduce_sum)
		{
			vecGradient.add(g);
		}
	}
	// End of parallel region
	////////////////////////////////////////////////////////////

	vecGradient.negate();

	// MaxMargin objective: min L = 0.5*\L2sigma*W*W + Loss()
	// MLE objective:       min L = 0.5*1/(\L2sigma*\L2sigma)*W*W - log p(y|x)
	// Add the regularization term
	double scale = (m->isMaxMargin())
		? m->getRegL2Sigma()
		: 1/(double)(m->getRegL2Sigma()*m->getRegL2Sigma());

	if( m->isMaxMargin() )
		ans = (1/(double)X->size()) * ans;

	if(m->getRegL2Sigma()!=0.0f)
	{
		for(int f=0; f<nbFeatures; f++)
			vecGradient[f] += (*m->getWeights())[f]*scale;
		ans += 0.5*scale*m->getWeights()->l2Norm(false);
	}
	return ans;
}
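// Worked form of the regularization block above (illustrative, following the
// objective comments in the code; sigma = getRegL2Sigma(), N = X->size()):
//   MaxMargin:  L(w) = 0.5*sigma*||w||^2 + (1/N) * sum_i Loss_i(w)
//   MLE:        L(w) = ||w||^2 / (2*sigma^2) - sum_i log p(y_i|x_i)
// so the L2 term contributes scale*w_k to the gradient and 0.5*scale*||w||^2
// to the value, with scale = sigma for max-margin and 1/sigma^2 for MLE.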
// Based on A.2 Structured Loss in Teo et al., JMLR 2010, pp. 343-344.
double GradientCRF::computeGradientMaxMargin(dVector& vecGradient, Model* m, DataSequence* X)
{
	int xi, xj, yi, yj, k, nbFeatures;
	double val, phi_star=0, phi_true=0, hamming_loss=0;

	Beliefs bel;
	pInfEngine->computeBeliefs(bel, pFeatureGen, X, m, false);

	// Compute the Hamming loss
	iVector ystar;
	dVector pystar;
	viterbiDecoding(bel, ystar, pystar);
	for(xi=0; xi<X->length(); xi++)
		if( X->getStateLabels(xi) != ystar[xi] )
			hamming_loss++;

	// Compute gradients
	feature* f;
	featureVector vecFeatures;

	nbFeatures = pFeatureGen->getNumberOfFeatures();
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);

	dVector *w = m->getWeights();
	dVector localGrad(nbFeatures);

	// Loop over nodes to compute features and update the gradient
	for(xi=0; xi<X->length(); xi++) {
		// Read the label for this state
		yi = X->getStateLabels(xi);

		// Get node features
		pFeatureGen->getFeatures(vecFeatures, X, m, xi, -1);
		f = vecFeatures.getPtr();
		for(k=0; k<vecFeatures.size(); k++, f++) {
			if(f->nodeState==yi) {
				phi_true += w->getValue(f->id) * f->value;
				localGrad[f->id] -= f->value;
			}
			else if(f->nodeState==ystar[xi]) {
				phi_star += w->getValue(f->id) * f->value;
				localGrad[f->id] += f->value;
			}
			val = bel.belStates[xi][f->nodeState]*f->value;
			vecGradient[f->id] -= val;
		}
	}

	// Loop over edges to compute features and update the gradient
	for(xi=0; xi<X->length()-1; xi++) {
		xj = xi+1;
		yi = X->getStateLabels(xi);
		yj = X->getStateLabels(xj);

		// Get edge features
		pFeatureGen->getFeatures(vecFeatures, X, m, xj, xi);
		f = vecFeatures.getPtr();
		for(k=0; k<vecFeatures.size(); k++, f++) {
			if(f->prevNodeState == yi && f->nodeState == yj) {
				phi_true += w->getValue(f->id) * f->value;
				localGrad[f->id] -= f->value;
			}
			else if(f->prevNodeState==ystar[xi] && f->nodeState==ystar[xj]) {
				phi_star += w->getValue(f->id) * f->value;
				localGrad[f->id] += f->value;
			}
			val = bel.belEdges[xi](f->prevNodeState,f->nodeState)*f->value;
			localGrad[f->id] -= val;
		}
	}

	// Taskar et al. (2004) vs. Tsochantaridis et al. (2005)
	bool useTaskar = false;
	double scale = (useTaskar) ? 1 : hamming_loss;

	// Done!
	localGrad.multiply(scale);
	vecGradient.add(localGrad);

	return hamming_loss + scale*(exp(phi_star-bel.partition) - exp(phi_true-bel.partition));
}
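// Note on the `scale` choice above (background, not from the original source):
// Taskar et al. (2004) use margin rescaling, where the Hamming loss is added
// inside the loss-augmented max, so the subgradient of the hinge term is not
// rescaled (scale = 1). Tsochantaridis et al. (2005) also describe slack
// rescaling, where the hinge term is multiplied by the loss, so its subgradient
// is scaled by the Hamming loss (scale = hamming_loss), which is the default here.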
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X)
{
	// Check the size of vecGradient
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	double ans = 0.0;
	int TID = 0;

	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);
	else
		vecGradient.set(0);

	// Initialize the buffers (vecFeaturesMP) for each thread
#ifdef _OPENMP
	setMaxNumberThreads(omp_get_max_threads());
	pInfEngine->setMaxNumberThreads(omp_get_max_threads());
	pFeatureGen->setMaxNumberThreads(omp_get_max_threads());
#endif
	for(int t=0; t<nbThreadsMP; t++) {
		if(localGrads[t].getLength() != nbFeatures)
			localGrads[t].resize(1,nbFeatures,0);
		else
			localGrads[t].set(0);
	}

	////////////////////////////////////////////////////////////
	// Start of parallel region
	// Some weird stuff in gcc 4.1 with OpenMP 2.5 support
#if ((_OPENMP == 200505) && __GNUG__)
#pragma omp parallel \
	shared(X, m, ans, nbFeatures, std::cout) \
	private(TID) \
	default(none)
#else
#pragma omp parallel \
	shared(vecGradient, X, m, ans, nbFeatures, std::cout) \
	private(TID) \
	default(none)
#endif
	{
#ifdef _OPENMP
		TID = omp_get_thread_num();
#endif
		// Create a temporary gradient
		double localSum = 0;
#ifdef WITH_SEQUENCE_WEIGHTS
		dVector tmpVecGradient(nbFeatures);
#endif

		// We can use unsigned if we have OpenMP 3.0 support (_OPENMP >= 200805).
#pragma omp for
#ifdef _OPENMP
#if _OPENMP >= 200805
		for(unsigned int i = 0; i < X->size(); i++) {
#else
		for(int i = 0; i < (int)X->size(); i++) {
#endif
#else
		for(unsigned int i = 0; i < X->size(); i++) {
#endif
			if (m->getDebugLevel() >= 2) {
#pragma omp critical(output)
				std::cout << "Thread " << TID << " computes gradient for sequence " << i
				          << " out of " << (int)X->size()
				          << " (Size: " << X->at(i)->length() << ")" << std::endl;
			}
			DataSequence* x = X->at(i);
#ifdef WITH_SEQUENCE_WEIGHTS
			tmpVecGradient.set(0);
			localSum += computeGradient(tmpVecGradient, m, x) * x->getWeightSequence();
			if(x->getWeightSequence() != 1.0)
				tmpVecGradient.multiply(x->getWeightSequence());
			localGrads[TID].add(tmpVecGradient);
#else
			localSum += computeGradient(localGrads[TID], m, x); // * x->getWeightSequence();
#endif
		}

		// We now put together the per-thread sums
#pragma omp critical (reduce_sum)
		{
			if( m->getDebugLevel() >= 2) {
				std::cout << "Thread " << TID << " updates sums" << std::endl;
			}
			ans += localSum;
			vecGradient.add(localGrads[TID]);
		}
	}
	// End of parallel region
	////////////////////////////////////////////////////////////

	// Because we are minimizing -log P
	vecGradient.negate();

	// Add the regularization term
	double sigmaL2Square = m->getRegL2Sigma()*m->getRegL2Sigma();
	if(sigmaL2Square != 0.0f) {
		if (m->getDebugLevel() >= 2) {
			std::cout << "Adding L2 norm gradient\n";
		}
		for(int f = 0; f < nbFeatures; f++) {
			vecGradient[f] += (*m->getWeights())[f]/sigmaL2Square;
		}
		double weightNorm = m->getWeights()->l2Norm(false);
		ans += weightNorm / (2.0*sigmaL2Square);
	}
	return ans;
}
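// A minimal, self-contained sketch of the accumulation pattern used above:
// each thread fills a private accumulator (the role played by localGrads[TID])
// and a named critical section merges the partial results. Illustrative only;
// parallelSum and its argument are made up for this example.
#include <vector>

double parallelSum(const std::vector<double>& values)
{
	double total = 0.0;
	#pragma omp parallel
	{
		double local = 0.0;                 // per-thread partial sum
		#pragma omp for
		for (int i = 0; i < (int)values.size(); i++)
			local += values[i];
		#pragma omp critical (reduce_sum)   // one thread at a time merges its partial sum
		total += local;
	}
	return total;
}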
double GradientMVHCRF::computeGradientMLE(dVector& vecGradient, Model* m, DataSequence* X)
{
	double f_value = 0; // return value

	////////////////////////////////////////////////////////////////////////////////////
	// Step 1: Run inference in each network to compute marginals conditioned on Y
	int nbSeqLabels = m->getNumberOfSequenceLabels();
	std::vector<Beliefs> condBeliefs(nbSeqLabels);
	dVector Partition(nbSeqLabels);

	for(int y=0; y<nbSeqLabels; y++)
	{
		pInfEngine->computeBeliefs(condBeliefs[y], pFeatureGen, X, m, true, y);
		Partition[y] = condBeliefs[y].partition;
	}

	////////////////////////////////////////////////////////////////////////////////////
	// Step 2: Compute expected values for node/edge features conditioned on Y
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	dMatrix condEValues(nbFeatures, nbSeqLabels);

	feature* f;
	featureVector vecFeatures;
	iMatrix adjMat;
	m->getAdjacencyMatrixMV(adjMat, X);

	int V = m->getNumberOfViews();
	int T = X->length();
	int nbNodes = V*T;

	double val;
	int y, k, xi, xj;

	for(y=0; y<nbSeqLabels; y++)
	{
		// Loop over nodes to compute features and update the gradient
		for(xi=0; xi<nbNodes; xi++)
		{
			pFeatureGen->getFeatures(vecFeatures, X, m, xi, -1, y);
			f = vecFeatures.getPtr();
			for(k=0; k<vecFeatures.size(); k++, f++)
			{
				// p(h^v_t=a|x,y) * f_k(v,t,a,x,y)
				val = condBeliefs[y].belStates[xi][f->nodeState] * f->value;
				condEValues.addValue(y, f->globalId, val);
			}
		}

		// Loop over edges to compute features and update the gradient
		for(xi=0; xi<nbNodes; xi++)
		{
			for(xj=xi+1; xj<nbNodes; xj++)
			{
				if( !adjMat(xi,xj) ) continue;
				pFeatureGen->getFeatures(vecFeatures, X, m, xj, xi, y);
				f = vecFeatures.getPtr();
				for(k=0; k<vecFeatures.size(); k++, f++)
				{
					// p(h^vi_ti=a, h^vj_tj=b|x,y) * f_k(vi,ti,vj,tj,x,y)
					val = condBeliefs[y].belEdges[adjMat(xi,xj)-1]
							(f->prevNodeState, f->nodeState) * f->value;
					condEValues.addValue(y, f->globalId, val);
				}
			}
		}
	}

	////////////////////////////////////////////////////////////////////////////////////
	// Step 3: Compute joint expected values
	dVector JointEValues(nbFeatures);
	dVector rowJ(nbFeatures); // expected value conditioned on seqLabel Y
	double sumZLog = Partition.logSumExp();
	for (int y=0; y<nbSeqLabels; y++)
	{
		condEValues.getRow(y, rowJ);
		rowJ.multiply( exp(Partition[y]-sumZLog) );
		JointEValues.add(rowJ);
	}

	////////////////////////////////////////////////////////////////////////////////////
	// Step 4: Compute the gradient as Exi[i,*,*] - Exi[*,*,*], that is the difference between
	// expected values conditioned on seqLabel Y and joint expected values
	if( vecGradient.getLength() != nbFeatures )
		vecGradient.create(nbFeatures);

	condEValues.getRow(X->getSequenceLabel(), rowJ);
	JointEValues.negate();
	rowJ.add(JointEValues);
	vecGradient.add(rowJ);

	// MLE: f = -log p(yi|xi) = log( sum_y' Z_y'(xi) ) - log( Z_yi(xi) )
	f_value = Partition.logSumExp() - Partition[X->getSequenceLabel()];
	return f_value;
}
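// Note on Step 3 above (illustrative, same convention as GradientHCRF): with
// Partition[y] = log Z_y(x), the mixing weight exp(Partition[y] - sumZLog) is
// the posterior p(y|x) = Z_y(x) / sum_y' Z_y'(x), computed in log space via
// logSumExp() for numerical stability. Step 3 therefore forms the joint
// expectation E[f_k|x] = sum_y p(y|x) * E[f_k|x,y].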