Example #1
double GradientCRF::computeGradient(dVector& vecGradient, Model* m, 
									DataSequence* X)
{
	//compute beliefs
	Beliefs bel;
	pInfEngine->computeBeliefs(bel,pFeatureGen, X, m, false);
	double phi = pFeatureGen->evaluateLabels(X,m);
	double partition = bel.partition;
	//Get adjacency matrix
	uMatrix adjMat;
	m->getAdjacencyMatrix(adjMat, X);
	//Check the size of vecGradient
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);
#if !defined(_VEC_FEATURES) && !defined(_OPENMP)
	featureVector* vecFeatures;
#endif
#if defined(_OPENMP)
	int ThreadID = omp_get_thread_num();
	if (ThreadID >= nbThreadsMP)
		ThreadID = 0;
#else
	int ThreadID = 0;
#endif
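	// Note (added): vecFeaturesMP[ThreadID] appears to be a per-thread feature buffer
	// (allocated via setMaxNumberThreads(), see Gradient::computeGradient further down),
	// so each OpenMP thread reuses its own buffer instead of reallocating a feature
	// vector for every node and edge it visits.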

	//Loop over nodes to compute features and update the gradient
	for(int i = 0; i < X->length(); i++)
	{
		// Read the label for this state
		int s = X->getStateLabels(i);
		//Get nodes features
#if defined(_VEC_FEATURES) || defined(_OPENMP)
		pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X,m,i,-1);
		// Loop over features
		feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
		for(int j = 0; j < vecFeaturesMP[ThreadID].size(); j++, pFeature++)
#else
		vecFeatures = pFeatureGen->getFeatures(X,m,i,-1);
		// Loop over features
		feature* pFeature = vecFeatures->getPtr();
		for(int j = 0; j < vecFeatures->size(); j++, pFeature++)
#endif
		{

			// If feature has same state label as the label from the
			// dataSequence, then add this to the gradient
			if(pFeature->nodeState == s)
				vecGradient[pFeature->id] += pFeature->value;
			//p(y_i=s|x)*f_k(i,s,x) is subtracted from the gradient 
			vecGradient[pFeature->id] -= bel.belStates[i][pFeature->nodeState]*pFeature->value;
		}
	}
	//Loop over edges to compute features and update the gradient
	int edgeIndex = 0;
	for(int row = 0; row < X->length(); row++) // Loop over all rows (the previous node index)
	{
		for(int col = row; col < X->length() ; col++) //Loop over all columns (the current node index)
		{
			if(adjMat(row,col) == 1)
			{
				int s1 = X->getStateLabels(row);
				int s2 = X->getStateLabels(col);

				//Get edge features
#if defined(_VEC_FEATURES) || defined(_OPENMP)
				pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X,m,col,row);
				// Loop over features
				feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
				for(int j = 0; j < vecFeaturesMP[ThreadID].size(); j++, pFeature++)
#else
				vecFeatures = pFeatureGen->getFeatures(X,m,col,row);
				// Loop over features
				feature* pFeature = vecFeatures->getPtr();
				for(int j = 0; j < vecFeatures->size(); j++, pFeature++)
#endif
				{
					// ++ Forward edge ++
					// If edge feature has same state labels as the labels from the dataSequence, then add it to the gradient
					if(pFeature->nodeState == s2 && pFeature->prevNodeState == s1)
						vecGradient[pFeature->id] += pFeature->value;

					//p(y_i=s1,y_j=s2|x)*f_k(i,j,s1,s2,x) is subtracted from the gradient 
					vecGradient[pFeature->id] -= bel.belEdges[edgeIndex](pFeature->prevNodeState,pFeature->nodeState)*pFeature->value;
				}
				edgeIndex++;
			}
		}
	}
	//Return -log instead of log() [Moved to Gradient::ComputeGradient by LP]
//	vecGradient.negate();
	return partition-phi;
}
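For reference, the node and edge loops above accumulate the (un-negated) gradient of the conditional log-likelihood of a linear-chain CRF. A sketch in informal notation, assuming evaluateLabels() returns the weighted feature sum w.F(x,y) of the observed labeling (the notation is mine, not the library's):

    d/dw_k log p(y|x) = sum_i f_k(i, y_i, x) - sum_i sum_s p(Y_i = s | x) * f_k(i, s, x)
                      + sum_(i,j) f_k(i, j, y_i, y_j, x)
                      - sum_(i,j) sum_(s,s') p(Y_i = s, Y_j = s' | x) * f_k(i, j, s, s', x)

    return value: partition - phi = log Z(x) - w.F(x, y) = -log p(y|x)

The negation of the gradient, to match the -log p(y|x) return value, happens later in Gradient::computeGradient, as noted in the comment above the return statement.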
double GradientHCRF::computeGradient(dVector& vecGradient, Model* m, DataSequence* X)
{
    int nbFeatures = pFeatureGen->getNumberOfFeatures();
    int NumSeqLabels=m->getNumberOfSequenceLabels();
    //Get adjacency matrix
    uMatrix adjMat;
    m->getAdjacencyMatrix(adjMat, X);
    if(vecGradient.getLength() != nbFeatures)
        vecGradient.create(nbFeatures);
    dVector Partition;
    Partition.resize(1,NumSeqLabels);
    std::vector<Beliefs> ConditionalBeliefs(NumSeqLabels);

    // Step 1 : Run Inference in each network to compute marginals conditioned on Y
    for(int i=0; i<NumSeqLabels; i++)
    {
        pInfEngine->computeBeliefs(ConditionalBeliefs[i],pFeatureGen, X, m, true,i);
        Partition[i] = ConditionalBeliefs[i].partition;
    }
    double f_value = Partition.logSumExp() - Partition[X->getSequenceLabel()];
    // Step 2: Compute expected values for feature nodes conditioned on Y
#if !defined(_VEC_FEATURES) && !defined(_OPENMP)
    featureVector* vecFeatures;
#endif
#if defined(_OPENMP)
    int ThreadID = omp_get_thread_num();
    if (ThreadID >= nbThreadsMP)
        ThreadID = 0;
#else
    int ThreadID = 0;
#endif
    double value;
    dMatrix CEValues;
    CEValues.resize(nbFeatures,NumSeqLabels);
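    // Layout note (added): CEValues is used with one row per sequence label Y and one
    // column per feature, i.e. CEValues(j, k) accumulates E[ f_k | x, Y=j ]
    // over Steps 2 and 3 below (see the getRow(j, ...) calls in Steps 4 and 5).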
    //Loop over nodes to compute features and update the gradient
    for(int j=0; j<NumSeqLabels; j++) { // For every label
        for(int i = 0; i < X->length(); i++) { // For every node
#if defined(_VEC_FEATURES) || defined(_OPENMP)
            pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X,m,i,-1,j);
            // Loop over features
            feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
            for(int k = 0; k < vecFeaturesMP[ThreadID].size(); k++, pFeature++)
#else
            vecFeatures = pFeatureGen->getFeatures(X,m,i,-1,j);
            // Loop over features
            feature* pFeature = vecFeatures->getPtr();
            for(int k = 0; k < vecFeatures->size(); k++, pFeature++)
#endif
            {
                //p(s_i=s|x,Y) * f_k(i,s,x,y)
                value=ConditionalBeliefs[j].belStates[i][pFeature->nodeState] * pFeature->value;
                CEValues.setValue(j,pFeature->globalId, CEValues(j,pFeature->globalId) + value); // one row for each Y
            }// end for every feature
        }// end for every node
    }// end for every sequence label
    // Step 3: Compute expected values for edge features conditioned on Y
    //Loop over edges to compute features and update the gradient
    for(int j=0; j<NumSeqLabels; j++) {
        int edgeIndex = 0;
        for(int row = 0; row < X->length(); row++) {
            // Loop over all rows (the previous node index)
            for(int col = row; col < X->length() ; col++) {
                //Loop over all columns (the current node index)
                if(adjMat(row,col) == 1) {
                    //Get edge features
#if defined(_VEC_FEATURES) || defined(_OPENMP)
                    pFeatureGen->getFeatures(vecFeaturesMP[ThreadID], X,m,col,row,j);
                    // Loop over features
                    feature* pFeature = vecFeaturesMP[ThreadID].getPtr();
                    for(int k = 0; k < vecFeaturesMP[ThreadID].size(); k++, pFeature++)
#else
                    vecFeatures = pFeatureGen->getFeatures(X,m,col,row,j);
                    // Loop over features
                    feature* pFeature = vecFeatures->getPtr();
                    for(int k = 0; k < vecFeatures->size(); k++, pFeature++)
#endif
                    {
                        //p(y_i=s1,y_j=s2|x,Y)*f_k(i,j,s1,s2,x,y)
                        value=ConditionalBeliefs[j].belEdges[edgeIndex](pFeature->prevNodeState,pFeature->nodeState) * pFeature->value;
                        CEValues.setValue(j,pFeature->globalId, CEValues(j,pFeature->globalId) + value);
                    }
                    edgeIndex++;
                }
            }
        }
    }
    // Step 4: Compute Joint Expected Values
    dVector JointEValues;
    JointEValues.resize(1,nbFeatures);
    JointEValues.set(0);
    dVector rowJ;
    rowJ.resize(1,nbFeatures);
    dVector GradientVector;
    double sumZLog=Partition.logSumExp();
    for (int j=0; j<NumSeqLabels; j++)
    {
        CEValues.getRow(j, rowJ);
        rowJ.multiply(exp(Partition.getValue(j)-sumZLog));
        JointEValues.add(rowJ);
    }
    // Step 5: Compute gradient as Exi[i,*,*] - Exi[*,*,*], that is, the difference
    // between the expected values conditioned on the sequence label and the joint
    // expected values
    CEValues.getRow(X->getSequenceLabel(), rowJ); // rowJ = expected value conditioned on the sequence label Y
    // [Negation moved to Gradient::ComputeGradient by LP]
//	 rowJ.negate();
    JointEValues.negate();
    rowJ.add(JointEValues);
    vecGradient.add(rowJ);
    return f_value;
}
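Step 4 above mixes the label-conditioned expectations using the posterior over sequence labels. A hedged reading in informal notation, assuming Partition[y] holds the log partition function of the network clamped to sequence label y:

    p(y | x)    = exp(Partition[y] - logSumExp(Partition))
    E[f_k | x]  = sum_y p(y | x) * E[f_k | x, y]                          (JointEValues)
    gradient_k += E[f_k | x, y_observed] - E[f_k | x]                     (Step 5)
    f_value     = logSumExp(Partition) - Partition[y_observed] = -log p(y_observed | x)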
Example #3
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X)
{
  double ans = 0.0;
  
#ifdef _OPENMP
  if( nbThreadsMP < 1 )
    nbThreadsMP = omp_get_max_threads();
  setMaxNumberThreads(nbThreadsMP);
  pInfEngine->setMaxNumberThreads(nbThreadsMP);
  pFeatureGen->setMaxNumberThreads(nbThreadsMP);
#endif
  
  //Check the size of vecGradient
  int nbFeatures = pFeatureGen->getNumberOfFeatures();
  if(vecGradient.getLength() != nbFeatures)
    vecGradient.create(nbFeatures);
  else
    vecGradient.set(0);
  
  ////////////////////////////////////////////////////////////
  // Start of parallel Region
  // Some weird stuff in gcc 4.1, with openmp 2.5 support
  //
  // Note 1: In OpenMP 2.5, the iteration variable in "for" must be
  // a signed integer variable type. In OpenMP 3.0 (_OPENMP>=200805),
  // it may also be an unsigned integer variable type, a pointer type,
  // or a constant-time random access iterator type.
  //
  // Note 2: schedule(static | dynamic): In the dynamic schedule, there
  // is no predictable order in which the loop items are assigned to
  // different threads. Each thread asks the OpenMP runtime library for
  // an iteration number, then handles it, then asks for the next one.
  // It is thus useful when different iterations in the loop may take
  // different time to execute.
#pragma omp parallel default(none) \
  shared(vecGradient, X, m, ans, nbFeatures, std::cout)
  {
    // code inside this region runs in parallel
    dVector g(nbFeatures, COLVECTOR, 0.0);
    
#pragma omp for schedule(dynamic) reduction(+:ans)
    for(int i=0; i<(int)X->size(); i++) {
      DataSequence* x = X->at(i);
      if( m->isWeightSequence() && x->getWeightSequence() != 1.0) {
        dVector tmp(nbFeatures, COLVECTOR, 0.0);
        ans += computeGradient(tmp, m, x) * x->getWeightSequence();
        tmp.multiply(x->getWeightSequence());
        g.add(tmp);
      }
      else {
        ans += computeGradient(g, m, x);
      }
    }
    
    // We now put together the gradients
    // No two threads can execute a critical directive of the same name at the same time
#pragma omp critical (reduce_sum)
    {
      vecGradient.add(g);
    }
  }
  // End of parallel Region
  ////////////////////////////////////////////////////////////
  vecGradient.negate();
  
  // MaxMargin objective: min L = 0.5*\L2sigma*W*W + Loss()
  // MLE objective: min L = 0.5*1/(\L2sigma*\L2sigma)*W*W - log p(y|x)
  
  // Add the regularization term
  double scale = (m->isMaxMargin())
		? m->getRegL2Sigma()
		: 1/(double)(m->getRegL2Sigma()*m->getRegL2Sigma());
  
  if( m->isMaxMargin() )
    ans = (1/(double)X->size()) * ans;
  
  if(m->getRegL2Sigma()!=0.0f)
  {
    for(int f=0; f<nbFeatures; f++)
      vecGradient[f] += (*m->getWeights())[f]*scale;
    ans += 0.5*scale*m->getWeights()->l2Norm(false);
  }
  
  return ans;
}
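The regularization block above uses the same update for both objectives; only the scale differs. Spelled out (my reading of the code and its comments, assuming l2Norm(false) returns the squared norm ||w||^2):

    MLE:        min_w  sum_i -log p(y_i|x_i) + ||w||^2 / (2*sigma^2)   ->  scale = 1/sigma^2
    MaxMargin:  min_w  (1/N) * sum_i loss_i  + (sigma/2) * ||w||^2     ->  scale = sigma

    in both cases:  vecGradient += scale * w    and    ans += 0.5 * scale * ||w||^2

where sigma = m->getRegL2Sigma(), N = X->size(), and loss_i is the value returned by the per-sequence computeGradient().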
Example #4
// Based on A.2 Structured Loss in Teo et al., JMLR 2010, pp. 343-344.
double GradientCRF::computeGradientMaxMargin(dVector& vecGradient, Model* m, DataSequence* X)
{
	int xi, xj, yi, yj, k, nbFeatures;
	double val, phi_star=0, phi_true=0, hamming_loss=0;

	Beliefs bel;
	pInfEngine->computeBeliefs(bel,pFeatureGen, X, m, false);

	// Compute Hamming loss
	iVector ystar; dVector pystar;
	viterbiDecoding(bel,ystar,pystar);
	for(xi=0; xi<X->length(); xi++) 
		if( X->getStateLabels(xi) != ystar[xi] ) 
			hamming_loss++;

	// Compute gradients
	feature* f;
	featureVector vecFeatures;

	nbFeatures = pFeatureGen->getNumberOfFeatures();
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);

	dVector *w = m->getWeights();
	dVector localGrad(nbFeatures); 

	// Loop over nodes to compute features and update the gradient
	for(xi=0; xi<X->length(); xi++)
	{
		// Read the label for this state
		yi = X->getStateLabels(xi);

		//Get nodes features
		pFeatureGen->getFeatures(vecFeatures,X,m,xi,-1);
		f = vecFeatures.getPtr();						
		for(k=0; k<vecFeatures.size(); k++, f++) {
			if(f->nodeState==yi) {
				phi_true += w->getValue(f->id) * f->value; 
				localGrad[f->id] -= f->value;
			}
			else if(f->nodeState==ystar[xi]) {
				phi_star += w->getValue(f->id) * f->value;
				localGrad[f->id] += f->value;
			}
			val = bel.belStates[xi][f->nodeState]*f->value;
			vecGradient[f->id] -= val;
		}
	}

	//Loop over edges to compute features and update the gradient
	for(xi=0; xi<X->length()-1; xi++) {
		xj = xi+1;
		yi = X->getStateLabels(xi);
		yj = X->getStateLabels(xj);

		//Get edge features
		pFeatureGen->getFeatures(vecFeatures,X,m,xj,xi);
		f = vecFeatures.getPtr();						
		for(k=0; k<vecFeatures.size(); k++, f++)
		{
			if(f->prevNodeState == yi && f->nodeState == yj) {
				phi_true += w->getValue(f->id) * f->value;
				localGrad[f->id] -= f->value;
			}
			else if(f->prevNodeState==ystar[xi] && f->nodeState==ystar[xj]) {
				phi_star += w->getValue(f->id) * f->value;
				localGrad[f->id] += f->value;
			}
			val = bel.belEdges[xi](f->prevNodeState,f->nodeState)*f->value;
			localGrad[f->id] -= val;
		}
	}

	// Taskar et al. (2004) vs Tsochantaridis et al. (2005)
	bool useTaskar = false; 
	double scale = (useTaskar) ? 1 : hamming_loss;

	// Done!
	localGrad.multiply(scale); 
	vecGradient.add(localGrad);
 

	return hamming_loss + scale*(exp(phi_star-bel.partition) - exp(phi_true-bel.partition));
}
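Since bel.partition is log Z(x) and phi_true / phi_star are the weighted feature sums of the ground-truth and Viterbi labelings, the exponentials in the return statement are conditional probabilities. An informal sketch of what is returned (assuming the summed node and edge features make up the complete score):

    exp(phi_true - bel.partition) = p(y      | x)
    exp(phi_star - bel.partition) = p(y_star | x)

    return value = HammingLoss(y, y_star) + scale * ( p(y_star | x) - p(y | x) )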
Example #5
double GradientCRF::computeGradientMLE(dVector& vecGradient, Model* m, DataSequence* X)
{
	int xi, xj, yi, yj, k, nbFeatures;
	double val, phi, partition;
	Beliefs bel;
	
	// Compute beliefs
	pInfEngine->computeBeliefs(bel,pFeatureGen, X, m, false);
	phi = pFeatureGen->evaluateLabels(X,m);
	partition = bel.partition;
 
	// Compute gradients

	// Check the size of vecGradient
	nbFeatures = pFeatureGen->getNumberOfFeatures();
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);

	feature* f;
	featureVector vecFeatures;

	// Loop over nodes to compute features and update the gradient
	for(xi=0; xi<X->length(); xi++) {
		yi = X->getStateLabels(xi);
		
		//Get nodes features
		pFeatureGen->getFeatures(vecFeatures,X,m,xi,-1);
		f = vecFeatures.getPtr();						
		for(k=0; k<vecFeatures.size(); k++, f++) 
		{
			if(f->nodeState == yi)
				vecGradient[f->id] += f->value;

			// p(y_i=s|x)*f_k(xi,s,x) is subtracted from the gradient 
			val = bel.belStates[xi][f->nodeState]*f->value;
			vecGradient[f->id] -= val;
		}
	}

	//Loop over edges to compute features and update the gradient
	for(xi=0; xi<X->length()-1; xi++) {
		xj = xi+1;
		yi = X->getStateLabels(xi);
		yj = X->getStateLabels(xj);

		//Get edge features
		pFeatureGen->getFeatures(vecFeatures,X,m,xj,xi);
		f = vecFeatures.getPtr();						
		for(k=0; k<vecFeatures.size(); k++, f++)
		{
			if(f->prevNodeState == yi && f->nodeState == yj)
				vecGradient[f->id] += f->value;

			//p(y_i=s1,y_j=s2|x)*f_k(i,j,s1,s2,x) is subtracted from the gradient 
			val = bel.belEdges[xi](f->prevNodeState,f->nodeState)*f->value;
			vecGradient[f->id] -= val;
		}
	}

	//Return -log instead of log() [Moved to Gradient::ComputeGradient by LP]
	return partition-phi;
}
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X)
{
	//Check the size of vecGradient
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	double ans = 0.0;
	int TID = 0;
	if(vecGradient.getLength() != nbFeatures)
		vecGradient.create(nbFeatures);
	else
		vecGradient.set(0);
	// Initialize the buffers (vecFeaturesMP) for each thread
#ifdef _OPENMP
	setMaxNumberThreads(omp_get_max_threads());
	pInfEngine->setMaxNumberThreads(omp_get_max_threads());
	pFeatureGen->setMaxNumberThreads(omp_get_max_threads());
#endif
	for(int t=0;t<nbThreadsMP;t++)
	{
		if(localGrads[t].getLength() != nbFeatures)
			localGrads[t].resize(1,nbFeatures,0);
		else
			localGrads[t].set(0);
	}

////////////////////////////////////////////////////////////
// Start of parallel Region

	// Some weird stuff in gcc 4.1, with openmp 2.5 support
#if ((_OPENMP == 200505) && __GNUG__)
#pragma omp parallel \
	shared(X, m, ans, nbFeatures, std::cout)	\
	private(TID) \
	default(none)
#else
#pragma omp parallel \
	shared(vecGradient, X, m, ans, nbFeatures, std::cout)	\
	private(TID) \
	default(none)
#endif
	{
#ifdef _OPENMP 
		TID = omp_get_thread_num();
#endif
		// Create a temporary gradient
		double localSum = 0;

#ifdef WITH_SEQUENCE_WEIGHTS
		dVector tmpVecGradient(nbFeatures);
#endif

#pragma omp for
		// we can use unsigned if we have openmp 3.0 support (_OPENMP>=200805).
#ifdef _OPENMP 
    #if _OPENMP >= 200805
		for(unsigned int i = 0; i< X->size(); i++){
    #else
	    for(int i = 0; i< X->size(); i++){
    #endif
#else
		for(unsigned int i = 0; i< X->size(); i++){
#endif
			if (m->getDebugLevel() >=2){
#pragma omp critical(output)
				std::cout << "Thread "<<TID<<" computes gradient for sequence " 
						  << i <<" out of " << (int)X->size() 
						  << " (Size: " <<  X->at(i)->length() << ")" << std::endl;
			}
			DataSequence* x = X->at(i);
#ifdef WITH_SEQUENCE_WEIGHTS
			tmpVecGradient.set(0);
			localSum += computeGradient(tmpVecGradient, m, x) * x->getWeightSequence();
			if(x->getWeightSequence() != 1.0)
				tmpVecGradient.multiply(x->getWeightSequence());
			localGrads[TID].add(tmpVecGradient);
#else
			localSum += computeGradient(localGrads[TID], m, x);// * x->getWeightSequence();
#endif
		}
#pragma omp critical (reduce_sum)
		// We now put together the sums
		{
			if( m->getDebugLevel() >= 2){
				std::cout<<"Thread "<<TID<<" update sums"<<std::endl;
			}
			ans += localSum;
			vecGradient.add(localGrads[TID]);
		}
	} 
	
// End of parallel Region
////////////////////////////////////////////////////////////

	// because we are minimizing -LogP
	vecGradient.negate();

	// Add the regularization term
	double sigmaL2Square = m->getRegL2Sigma()*m->getRegL2Sigma();
	if(sigmaL2Square != 0.0f) {
		if (m->getDebugLevel() >= 2){
			std::cout << "Adding L2 norm gradient\n";
		}
		for(int f = 0; f < nbFeatures; f++) {
			vecGradient[f] += (*m->getWeights())[f]/sigmaL2Square;
		}
		double weightNorm = m->getWeights()->l2Norm(false);
		ans += weightNorm / (2.0*sigmaL2Square);
	}
	return ans;
}
Example #7
double GradientMVHCRF::computeGradientMLE(dVector& vecGradient, Model* m, DataSequence* X)
{    
	double f_value=0; // return value

	////////////////////////////////////////////////////////////////////////////////////
	// Step 1 : Run Inference in each network to compute marginals conditioned on Y
 	int nbSeqLabels = m->getNumberOfSequenceLabels();
	std::vector<Beliefs> condBeliefs(nbSeqLabels);	
	dVector Partition(nbSeqLabels);
	 
	for(int y=0; y<nbSeqLabels; y++) 
	{ 
		pInfEngine->computeBeliefs(condBeliefs[y], pFeatureGen, X, m, true, y);
		Partition[y] = condBeliefs[y].partition;
	} 
	
	////////////////////////////////////////////////////////////////////////////////////
	// Step 2 : Compute expected values for node/edge features conditioned on Y
	int nbFeatures = pFeatureGen->getNumberOfFeatures();
	dMatrix condEValues(nbFeatures, nbSeqLabels);
	
	feature* f;
	featureVector vecFeatures;

	iMatrix adjMat;
	m->getAdjacencyMatrixMV(adjMat, X);
	
	int V = m->getNumberOfViews();
	int T = X->length(); 
	int nbNodes= V*T;

	double val;
	int y, k, xi, xj;
	
	for(y=0; y<nbSeqLabels; y++) 
	{ 
		// Loop over nodes to compute features and update the gradient
		for(xi=0; xi<nbNodes; xi++) {
			pFeatureGen->getFeatures(vecFeatures,X,m,xi,-1,y);			
			f = vecFeatures.getPtr();						
			for(k=0; k<vecFeatures.size(); k++, f++) {  				
				// p(h^v_t=a|x,y) * f_k(v,t,a,x,y)
				val = condBeliefs[y].belStates[xi][f->nodeState] * f->value;
				condEValues.addValue(y, f->globalId, val);
			} 
		} 

		// Loop over edges to compute features and update the gradient
		for(xi=0; xi<nbNodes; xi++) {
			for(xj=xi+1; xj<nbNodes; xj++) {
				if( !adjMat(xi,xj) ) continue;
				pFeatureGen->getFeatures(vecFeatures,X,m,xj,xi,y);
				f = vecFeatures.getPtr();				
				for(k=0; k<vecFeatures.size(); k++, f++) {
					// p(h^vi_ti=a,h^vj_tj=b|x,y) * f_k(vi,ti,vj,tj,x,y)
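					// Note (added): adjMat from getAdjacencyMatrixMV() appears to store
					// (edge index + 1), with 0 meaning "no edge"; hence the
					// belEdges[adjMat(xi,xj)-1] lookup below.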
					val = condBeliefs[y].belEdges[adjMat(xi,xj)-1]
							(f->prevNodeState,f->nodeState) * f->value;
					condEValues.addValue(y, f->globalId, val);
				} 
			} 
		} 	
	} 

	////////////////////////////////////////////////////////////////////////////////////
	// Step 3: Compute Joint Expected Values
	dVector JointEValues(nbFeatures);
	dVector rowJ(nbFeatures);  // expected value conditioned on seqLabel Y
	double sumZLog = Partition.logSumExp();
	for (int y=0; y<nbSeqLabels; y++) 
	{
		condEValues.getRow(y, rowJ);
		rowJ.multiply( exp(Partition[y]-sumZLog) );
		JointEValues.add(rowJ);
	}
	
	////////////////////////////////////////////////////////////////////////////////////
	// Step 4 Compute Gradient as Exi[i,*,*] - Exi[*,*,*], that is the difference between 
	// expected values conditioned on seqLabel Y and joint expected values	
	if( vecGradient.getLength() != nbFeatures )
		vecGradient.create(nbFeatures);

	condEValues.getRow(X->getSequenceLabel(), rowJ); 
	JointEValues.negate();
	rowJ.add(JointEValues);
	vecGradient.add(rowJ);  

	// MLE: return log(sum_y' p(y'|xi)) - log(p(yi|xi))
	f_value = Partition.logSumExp() - Partition[X->getSequenceLabel()]; 
	return f_value;
}