int NaiveBayes<T>::Classify(const arma::Col<T>& instance)
{
	auto classVal = -1;

	//Calculate standard deviation
    stdDev = arma::sqrt(featureVariances);

	//Pre-cook the stdDev * sqrtTwoPi values for the distribution calculation - avoids an unnecessary temporary - no malloc.
	stdDev *= sqrtTwoPi;
    stdDev = arma::pow(stdDev, -1);
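	//stdDev now holds 1 / (sigma * sqrtTwoPi) element-wise, i.e. the reciprocal of the Gaussian
	//normalisation constant, so arma::log(stdDev) below yields -log(sigma * sqrt(2*pi)).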
    

	for (size_t j = 0; j < featureMeans.n_cols; ++j)
	{
		/*
		 * In the code below, diffs, featureMeans, featureVariances,
		 * distribution, exponents, stdDev, testProbs and priorProbs are all
		 * Armadillo matrix or vector types which would otherwise need to be
		 * represented by arrays or user-defined classes.
		 */
		diffs = instance - featureMeans.col(j);
		
		/** Using log probabilities to eliminate/reduce floating point
		 *  errors when the product of many small attribute probabilities underflows
		 *  the representable minimum.
		 *
		 *  log(exp(value)) == value, so the original calculation below has been
		 *  altered when calculating the normal/Gaussian distribution exponents.
		 *
		 *  Non-log based calculation:
		 *
		 *		exponents = arma::exp(-arma::square(diffs) / (2 * featureVariances.col(j)));
		 *
		 *	We can use 2 * featureVariances in the calculation below rather than squaring the standard
		 *	deviation/stdDev values as per the notation of the Gaussian distribution in the literature - square(sqrt(value)) == value.
		 */
		exponents = -arma::square(diffs) / (2 * featureVariances.col(j));

		//Per-feature log of the normal/Gaussian density: exponent term plus log of the pre-cooked 1/(stdDev * sqrtTwoPi).
		distribution = exponents + arma::log(stdDev.col(j));

		//Use the sum of log values for the test instance probabilities rather than a raw multiply (less risk of float errors).
		testProbs(j) = std::log(priorProbs(j)) + arma::accu(distribution);
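		//i.e. testProbs(j) = log P(class j) + sum_i [ -(x_i - mu_ij)^2 / (2*sigma_ij^2) - log(sigma_ij * sqrtTwoPi) ],
		//the log of the unnormalised posterior for class j under the naive independence assumption.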
	}


    //classVal is the label with the max probability value (classes are indexed 0 to numClasses-1, hence index_max()).
	classVal = testProbs.index_max();

	return classVal;
}
void MlMaximumEntropyModel::learnLMBFGS(MlTrainingContainer* params)
{
	params->initialize();

	const MlDataSet* trainingDataSet = params->getTrainingDataSet();
	const MlDataSet* testDataSet     = params->getTestDataSet();
	const vector<size_t>& trainingIdxs = params->getTrainingIdxs();
	const vector<size_t>& testIdxs	   = params->getTestIdxs();
	const size_t numSamples = trainingIdxs.size();
	const bool performTest = (testDataSet && testIdxs.size()>0);

	if (trainingDataSet->getNumClasess() < 2)
		error("learnLMBFGS accepts only datasets with 2 or more classes, your data has ",
			trainingDataSet->getNumClasess());

	const double  lambda  = params->getLambda();
	const size_t memorySize = params->getLmBfgsMemorySize(); 
	const size_t reportFrequency = params->getVerboseLevel();
	const double perplexityDelta  = params->getPerplexityDelta();
	const size_t numClasses  = trainingDataSet->getNumClasess();
	const size_t numFeatures = trainingDataSet->getNumBasicFeatures(); // F
	const size_t numTraining = trainingIdxs.size();    // N

	const size_t numRestarts=3;

	// data structures used for training
	vector< vector<double> >& w = weights_; // class X features
	vector< vector<double> > wOld(numClasses, vector<double>(numFeatures,0.0)); // class X features
	vector< vector<double> > wtx(numClasses, vector<double>(numTraining, 0.0));  // class X samples
	vector< vector<double> > qtx(numClasses, vector<double>(numTraining, 0.0));  // class X samples
	vector< vector<double> > q(numClasses, vector<double>(numFeatures,0.0));       // class X features
	vector< vector<double> > g(numClasses, vector<double>(numFeatures,0.0));       // class X features
	vector< vector<double> > gOld(numClasses, vector<double>(numFeatures,0.0));  // class X features

	vector< vector<float> > trainingProbs(numClasses, vector<float>(numTraining));
	vector< vector<float> > testProbs(numClasses, vector<float>(numTraining));
	vector< vector<double> > bestW(numClasses, vector<double>(numFeatures));
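	/*
	 * Notation (inferred from the code below): w holds the per-class weight vectors, g the
	 * gradient and q the current search direction; wtx[c][i] and qtx[c][i] cache the dot
	 * products w[c].x_i and q[c].x_i for each training sample so the line search and the
	 * weight updates only need scalar work per sample. bestW keeps the weights of the
	 * best-scoring round.
	 */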

	// initialize weights
	if (params->getInputPath().length() > 1)
	{
		const string modelFile = params->getInputPath() + "_scr.txt";
		if (readModel(modelFile.c_str()))
			params->setIndClearWeights(false);
	}

	if (params->getIndClearWeights())
		weights_.clear();

	weights_.resize(numClasses, vector<double>(numFeatures,0.0));

	double previousPerplexity = MAX_FLOAT;
	float  bestTestError=1.0;
	size_t bestTestRound=0;
	float  bestTrainingError=1.0;
	size_t bestTrainingRound=0;

	bool terminateTraining = false;
	size_t totalRounds=0;
	size_t megaRound=0;
	for ( megaRound=0; megaRound<numRestarts; megaRound++)
	{
		// first round
		computeGradient(trainingDataSet, trainingIdxs, w, wtx, lambda, g);

		const double gtg = computeDotProduct(g,g);
		const double denominator = 1.0 / sqrt(gtg);
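		// No curvature information is available at a (re)start, so the first search
		// direction q is simply the gradient scaled to unit length.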
		for (size_t c=0; c<numClasses; c++)
			for (size_t i=0; i<numFeatures; i++)
				q[c][i]=g[c][i]*denominator;

		// qtx <- qTx
		for (size_t c=0; c<numClasses; c++)
			for (size_t i=0; i<numSamples; i++)
			{
				const MlSample& sample = trainingDataSet->getSample(trainingIdxs[i]);
				qtx[c][i]=computeDotProduct(q[c],sample.pairs);
			}

		// eta <- lineSearch(...)
		double eta = lineSearch(trainingDataSet, trainingIdxs, w, wtx, qtx, g, q, lambda);
		//cout << "eta = " << eta << endl;

		// update wtx <- wtx + eta*qtx
		for (size_t c=0; c<numClasses; c++)
			for (size_t i=0; i<wtx[c].size(); i++)
				wtx[c][i]+=eta*qtx[c][i];

		// update wOld<- w ; w <- w + eta *q ; gOld<-g
		for (size_t c=0; c<numClasses; c++)
		{
			memcpy(&wOld[c][0],&w[c][0],sizeof(double)*w[c].size());
			memcpy(&gOld[c][0],&g[c][0],sizeof(double)*g[c].size());
			for (size_t i=0; i<numFeatures; i++)
				w[c][i]+= eta*q[c][i];
		}


		// initialize memory
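		// memoryU/memoryD form a ring buffer of the last memorySize curvature pairs
		// (u = g_new - g_old, d = w_new - w_old) used by the L-BFGS-style update below;
		// nextMemPosition marks the slot to overwrite next.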
		vector< vector< vector<double> > > memoryU(memorySize, vector< vector<double> >(numClasses));
		vector< vector< vector<double> > > memoryD(memorySize, vector< vector<double> >(numClasses));
		vector< double > memoryAlpha(memorySize);
		size_t nextMemPosition=0;
		size_t numMemPushes=0;
		
		// iterate until convergence
		size_t round=1;
		while (round<10000)
		{
			// compute errors and report round results
			{
				double trainingLogLikelihood=0.0, testLogLikelihood=0.0;
				const double trainingError = calcErrorRateWithLogLikelihood(trainingDataSet, trainingIdxs,
																		false, &trainingLogLikelihood);
				double testError=1.0;
				if (performTest)
					testError = calcErrorRateWithLogLikelihood(testDataSet, testIdxs, false, &testLogLikelihood);

				if (reportFrequency>0 && round % reportFrequency == 0)
				{
					cout << round << "\t" << scientific << setprecision(5) << trainingLogLikelihood << "\t" << fixed << setprecision(5) << trainingError;
					if (performTest)
						cout <<"\t" << scientific << testLogLikelihood << "\t" << fixed << setprecision(5)<< testError;
					cout << endl;
				}
				
				if (performTest)
				{
					if (testError<=bestTestError)
					{
						bestTestRound=round;
						bestTestError=testError;
						for (size_t c=0; c<numClasses; c++)
							memcpy(&bestW[c][0],&w[c][0],numFeatures*sizeof(double)); // copy weights
					}
				}
				
				if (trainingError<=bestTrainingError)
				{
					bestTrainingRound=round;
					bestTrainingError=trainingError;
					if (! performTest)
					{
						for (size_t c=0; c<numClasses; c++)
							memcpy(&bestW[c][0],&w[c][0],numFeatures*sizeof(double)); // copy weights
					}
				}		
			}

			// Train new round

			computeGradient(trainingDataSet, trainingIdxs, w, wtx, lambda, g);

			double alpha=0.0;
			double sigma=0.0;
			double utu=0.0;

			// write u=g'-g and d=w'-w onto memory, use them to compute alpha and sigma
			vector< vector<double> >& u = memoryU[nextMemPosition];
			vector< vector<double> >& d = memoryD[nextMemPosition];
			for (size_t c=0; c<numClasses; c++)
			{
				const size_t numFeatures = g[c].size();
				u[c].resize(numFeatures);
				d[c].resize(numFeatures);
				for (size_t i=0; i<numFeatures; i++)
				{
					const double gDiff = g[c][i]-gOld[c][i];
					const double wDiff = w[c][i]-wOld[c][i];
					u[c][i]=gDiff;
					d[c][i]=wDiff;
					alpha += gDiff*wDiff;
					utu += gDiff*gDiff;
				}
			}
			sigma = alpha / utu;
			memoryAlpha[nextMemPosition]=alpha;
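			// alpha = u.d and sigma = (u.d)/(u.u); sigma serves as the initial scaling of the
			// implicit inverse-Hessian approximation (applied below via q <- sigma*q).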

			// update memory position
			nextMemPosition++;
			if (nextMemPosition == memorySize)
				nextMemPosition = 0;
			numMemPushes++;

			// q<-g
			for (size_t c=0; c<numClasses; c++)
				memcpy(&q[c][0],&g[c][0],g[c].size()*sizeof(double));
			
			// determine memory evaluation order 1..M (M is the newest)
			vector<size_t> memOrder;
			if (numMemPushes<=memorySize)
			{
				for (size_t i=0; i<numMemPushes; i++)
					memOrder.push_back(i);
			}
			else
			{
				for (size_t i=0; i<memorySize; i++)
					memOrder.push_back((i+nextMemPosition) % memorySize);
			}
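			// memOrder lists the occupied memory slots from oldest to newest, so the
			// descending loop below visits the most recent curvature pairs first.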

			vector<double> beta(memOrder.size(),0.0);
			for (int i=memOrder.size()-1; i>=0; i--)
			{
				const size_t m = memOrder[static_cast<size_t>(i)];
				const double alpha = memoryAlpha[m];
				
				const vector< vector<double> >& dM = memoryD[m];
				double& betaM = beta[m];

				// compute beta[m] = (memory_d[m] dot g)/alpha[m]
				for (size_t c=0; c<dM.size(); c++)
					for (size_t i=0; i<dM[c].size(); i++)
						betaM += dM[c][i]*g[c][i];
				betaM/=alpha;
				
				// q <- q - beta[m]*memory_u[m]
				const vector< vector<double> >& uM = memoryU[m]; 
				for (size_t c=0; c<q.size(); c++)
					for (size_t i=0; i<q[c].size(); i++)
						q[c][i] -= betaM * uM[c][i];

			}

			// q <- sigma*q
			for (size_t c=0; c<q.size(); c++)
				for (size_t i=0; i<q[c].size(); i++)
					q[c][i]*=sigma;


			for (size_t i=0; i<memOrder.size(); i++)
			{
				const size_t m = memOrder[static_cast<size_t>(i)];
				const vector< vector<double> >& uM = memoryU[m];
				const vector< vector<double> >& dM = memoryD[m]; 
				const double betaM = beta[m];
				const double oneOverAlpha = 1.0 / memoryAlpha[m];
				double umq = computeDotProduct(uM,q);
				for (size_t c=0; c<numClasses; c++)
					for (size_t j=0; j<q[c].size(); j++)
					{
						const double dq = dM[c][j] * (betaM - umq*oneOverAlpha);
						umq += uM[c][j]*dq;
						q[c][j] += dq;
					}
		
			}
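			// After the backward (beta) pass and the forward correction pass above, q holds an
			// L-BFGS-style approximation of H^-1 * g built from the stored curvature pairs;
			// it is negated next to obtain a descent direction.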

			// q<- -q
			for (size_t c=0; c<numClasses; c++)
				for (size_t i=0; i<q[c].size(); i++)
					q[c][i]=-q[c][i];
			
			// qtx = q*X
			for (size_t i=0; i<trainingIdxs.size(); i++)
			{
				const MlSample& sample = trainingDataSet->getSample(trainingIdxs[i]);
				for (size_t c=0; c<numClasses; c++)
					qtx[c][i]=computeDotProduct(q[c],sample.pairs);
			}
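			// Caching qtx = q.x for every training sample lets the line search and the wtx
			// update below work with wtx + eta*qtx instead of recomputing dot products.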
			

			bool needToRestart=false;
			eta = lineSearch(trainingDataSet, trainingIdxs, w, wtx, qtx, g, q, lambda);
			if (eta<= 0.0)
			{
				// restart ?
				needToRestart = true;
			}
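			// A non-positive step suggests the computed direction is not a usable descent
			// direction; finish this round and restart the outer loop with a fresh
			// steepest-descent step.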

			// update wOld<- w ; w <- w + eta *q ; gOld<- g
			for (size_t c=0; c<numClasses; c++)
			{
				memcpy(&wOld[c][0],&w[c][0],sizeof(double)*w[c].size());
				memcpy(&gOld[c][0],&g[c][0],sizeof(double)*g[c].size());
				for (size_t i=0; i<numFeatures; i++)
					w[c][i]+= eta*q[c][i];
			}

			for (size_t c=0; c<numClasses; c++)
				for (size_t i=0; i<numSamples; i++)
					wtx[c][i]+=eta*qtx[c][i];

			round++;
			totalRounds++;
			if (terminateTraining || needToRestart)
				break;
		}
		
		if (terminateTraining)
			break;
	}

	if (! params->getIndHadInternalError())
	{
		params->setIndNormalTermination(true);
	}
	else
		cout << "Warning: encountered mathemtical error while training!" << endl;

	weights_ = bestW;

	cout << "W=" << endl;
	printVector(weights_);
	cout << endl;

	cout << "Terminated after " << totalRounds << " rounds (" << megaRound << " restarts)" << endl;
	cout << "Best training error  " << fixed << setprecision(8) << bestTrainingError << " (round " << bestTrainingRound << ")" << endl;
	if (performTest)
		cout << "Best test error      "  << bestTestError     << " (round " << bestTestRound << ")" << endl;

	indWasInitialized_ = true;

	//this->calcErrorRateWithPerplexity(trainingDataSet, trainingIdxs, true, NULL);
}