template <typename T>
int NaiveBayes<T>::Classify(const arma::Col<T>& instance)
{
    auto classVal = -1;

    // Calculate standard deviation.
    stdDev = arma::sqrt(featureVariances);

    // Pre-cook 1 / (stdDev * sqrtTwoPi) values for the distribution
    // calculation. Done in place to avoid an unnecessary temporary - no malloc.
    stdDev *= sqrtTwoPi;
    stdDev = arma::pow(stdDev, -1);

    for (size_t j = 0; j < featureMeans.n_cols; ++j)
    {
        /*
         * In the code below, diffs, featureMeans, featureVariances,
         * distribution, exponents, stdDev, testProbs and priorProbs are all
         * Armadillo matrix or vector types which would otherwise need to be
         * represented by arrays or user-defined classes.
         */
        diffs = instance - featureMeans.col(j);

        /*
         * Using log probabilities to eliminate/reduce floating-point errors
         * when the product of many small attribute probabilities falls below
         * the representable minimum.
         *
         * log(exp(value)) == value, so the exponent of the normal/Gaussian
         * distribution can be kept as-is in log space.
         *
         * Non-log calculation:
         *
         * exponents = arma::exp(-arma::square(diffs) / (2 * featureVariances.col(j)));
         *
         * We can use 2 * featureVariances below rather than squaring the
         * standard deviation (stdDev) values used in the usual notation of the
         * Gaussian distribution in the literature - square(sqrt(value)) == value.
         */
        exponents = -arma::square(diffs) / (2 * featureVariances.col(j));

        // Calculate the log of the normal/Gaussian distribution; stdDev already
        // holds 1 / (sigma * sqrtTwoPi), so its log is the normalization term.
        distribution = exponents + arma::log(stdDev.col(j));

        // Use the sum of log values for the test instance probabilities rather
        // than a raw multiply (less risk of floating-point errors).
        testProbs(j) = std::log(priorProbs(j)) + arma::accu(distribution);
    }

    // The class value is the label with the max probability
    // (classes range from 0 to numClasses - 1, hence index_max()).
    classVal = testProbs.index_max();

    return classVal;
}
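/*
 * A minimal, self-contained sketch (assuming only Armadillo; mu, var and x are
 * illustrative stand-ins, not members of NaiveBayes) verifying the log-space
 * identity Classify() relies on: the log of the Gaussian density equals
 * -(x - mu)^2 / (2*sigma^2) - log(sigma * sqrt(2*pi)).
 */
#include <armadillo>
#include <cmath>

int main()
{
    const double sqrtTwoPi = std::sqrt(2.0 * arma::datum::pi);
    arma::vec mu  = {0.5, 1.0};   // per-feature means for one class
    arma::vec var = {0.25, 4.0};  // per-feature variances for one class
    arma::vec x   = {0.7, -1.2};  // a test instance

    arma::vec diffs = x - mu;

    // Direct density; multiplying many such terms risks underflow.
    arma::vec pdf = arma::exp(-arma::square(diffs) / (2.0 * var))
                    / (arma::sqrt(var) * sqrtTwoPi);

    // Log-space version as in Classify(): exponent plus log-normalization term.
    arma::vec logPdf = -arma::square(diffs) / (2.0 * var)
                       - arma::log(arma::sqrt(var) * sqrtTwoPi);

    arma::vec directLog = arma::log(pdf);
    directLog.print("direct log(pdf):");  // should match logPdf below
    logPdf.print("log-space logPdf:");
    return 0;
}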
void MlMaximumEntropyModel::learnLMBFGS(MlTrainingContainer* params)
{
    params->initialize();

    const MlDataSet* trainingDataSet = params->getTrainingDataSet();
    const MlDataSet* testDataSet     = params->getTestDataSet();
    const vector<size_t>& trainingIdxs = params->getTrainingIdxs();
    const vector<size_t>& testIdxs     = params->getTestIdxs();
    const size_t numSamples  = trainingIdxs.size();
    const bool   performTest = (testDataSet && testIdxs.size() > 0);

    if (trainingDataSet->getNumClasess() < 2)
        error("learnLMBFGS accepts only datasets with 2 or more classes, your data has ",
              trainingDataSet->getNumClasess());

    const double lambda          = params->getLambda();
    const size_t memorySize      = params->getLmBfgsMemorySize();
    const size_t reportFrequency = params->getVerboseLevel();
    const double perplexityDelta = params->getPerplexityDelta();
    const size_t numClasses  = trainingDataSet->getNumClasess();
    const size_t numFeatures = trainingDataSet->getNumBasicFeatures(); // F
    const size_t numTraining = trainingIdxs.size();                    // N
    const size_t numRestarts = 3;

    // data structures used for training
    vector< vector<double> >& w = weights_;                                      // class X features
    vector< vector<double> > wOld(numClasses, vector<double>(numFeatures, 0.0)); // class X features
    vector< vector<double> > wtx(numClasses, vector<double>(numTraining, 0.0));  // class X samples
    vector< vector<double> > qtx(numClasses, vector<double>(numTraining, 0.0));  // class X samples
    vector< vector<double> > q(numClasses, vector<double>(numFeatures, 0.0));    // class X features
    vector< vector<double> > g(numClasses, vector<double>(numFeatures, 0.0));    // class X features
    vector< vector<double> > gOld(numClasses, vector<double>(numFeatures, 0.0)); // class X features
    vector< vector<float> >  trainingProbs(numClasses, vector<float>(numTraining));
    vector< vector<float> >  testProbs(numClasses, vector<float>(numTraining));
    vector< vector<double> > bestW(numClasses, vector<double>(numFeatures));

    // initialize weights
    if (params->getInputPath().length() > 1)
    {
        const string modelFile = params->getInputPath() + "_scr.txt";
        if (readModel(modelFile.c_str()))
            params->setIndClearWeights(false);
    }

    if (params->getIndClearWeights())
        weights_.clear();
    weights_.resize(numClasses, vector<double>(numFeatures, 0.0));

    double previousPerplexity = MAX_FLOAT;
    float  bestTestError = 1.0;
    size_t bestTestRound = 0;
    float  bestTrainingError = 1.0;
    size_t bestTrainingRound = 0;
    bool   terminateTraining = false;

    size_t totalRounds = 0;
    size_t megaRound   = 0;
    for (megaRound = 0; megaRound < numRestarts; megaRound++)
    {
        // first round
        computeGradient(trainingDataSet, trainingIdxs, w, wtx, lambda, g);

        const double gtg = computeDotProduct(g, g);
        const double denominator = 1.0 / sqrt(gtg);
        for (size_t c = 0; c < numClasses; c++)
            for (size_t i = 0; i < numFeatures; i++)
                q[c][i] = g[c][i] * denominator;

        // qtx <- qTx
        for (size_t c = 0; c < numClasses; c++)
            for (size_t i = 0; i < numSamples; i++)
            {
                const MlSample& sample = trainingDataSet->getSample(trainingIdxs[i]);
                qtx[c][i] = computeDotProduct(q[c], sample.pairs);
            }

        // eta <- lineSearch(...)
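        // A note on the step below (a reading of this code, not the original
        // authors' documentation): on the first round the search direction q
        // is just the normalized gradient, i.e. steepest descent,
        //
        //   q = g / sqrt(g.g),   qtx[c][i] = q[c] . x_i,
        //
        // and lineSearch() is assumed to return the step size eta along q.
        // The wtx matrix caches w[c] . x_i per class and sample, so the
        // objective along the search line can be evaluated from wtx + eta*qtx
        // without re-scanning the feature vectors.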
        double eta = lineSearch(trainingDataSet, trainingIdxs, w, wtx, qtx, g, q, lambda);
        //cout << "eta = " << eta << endl;

        // update wtx <- wtx + eta*qtx
        for (size_t c = 0; c < numClasses; c++)
            for (size_t i = 0; i < wtx[c].size(); i++)
                wtx[c][i] += eta * qtx[c][i];

        // update wOld <- w ; w <- w + eta*q ; gOld <- g
        for (size_t c = 0; c < numClasses; c++)
        {
            memcpy(&wOld[c][0], &w[c][0], sizeof(double) * w[c].size());
            memcpy(&gOld[c][0], &g[c][0], sizeof(double) * g[c].size());
            for (size_t i = 0; i < numFeatures; i++)
                w[c][i] += eta * q[c][i];
        }

        // initialize memory
        vector< vector< vector<double> > > memoryU(memorySize, vector< vector<double> >(numClasses));
        vector< vector< vector<double> > > memoryD(memorySize, vector< vector<double> >(numClasses));
        vector< double > memoryAlpha(memorySize);
        size_t nextMemPosition = 0;
        size_t numMemPushes = 0;

        // iterate until convergence
        size_t round = 1;
        while (round < 10000)
        {
            // compute errors and report round results
            {
                double trainingLogLikelihood = 0.0, testLogLikelihood = 0.0;
                const double trainingError = calcErrorRateWithLogLikelihood(trainingDataSet, trainingIdxs,
                                                                            false, &trainingLogLikelihood);
                double testError = 1.0;
                if (performTest)
                    testError = calcErrorRateWithLogLikelihood(testDataSet, testIdxs,
                                                               false, &testLogLikelihood);

                if (reportFrequency > 0 && round % reportFrequency == 0)
                {
                    cout << round << "\t" << scientific << setprecision(5) << trainingLogLikelihood
                         << "\t" << fixed << setprecision(5) << trainingError;
                    if (performTest)
                        cout << "\t" << scientific << testLogLikelihood
                             << "\t" << fixed << setprecision(5) << testError;
                    cout << endl;
                }

                if (performTest)
                {
                    if (testError <= bestTestError)
                    {
                        bestTestRound = round;
                        bestTestError = testError;
                        for (size_t c = 0; c < numClasses; c++)
                            memcpy(&bestW[c][0], &w[c][0], numFeatures * sizeof(double)); // copy weights
                    }
                }

                if (trainingError <= bestTrainingError)
                {
                    bestTrainingRound = round;
                    bestTrainingError = trainingError;
                    if (!performTest)
                    {
                        for (size_t c = 0; c < numClasses; c++)
                            memcpy(&bestW[c][0], &w[c][0], numFeatures * sizeof(double)); // copy weights
                    }
                }
            }

            // Train new round
            computeGradient(trainingDataSet, trainingIdxs, w, wtx, lambda, g);

            double alpha = 0.0;
            double sigma = 0.0;
            double utu   = 0.0;

            // write u=g'-g and d=w'-w onto memory, use them to compute alpha and sigma
            vector< vector<double> >& u = memoryU[nextMemPosition];
            vector< vector<double> >& d = memoryD[nextMemPosition];
            for (size_t c = 0; c < numClasses; c++)
            {
                const size_t numFeatures = g[c].size();
                u[c].resize(numFeatures);
                d[c].resize(numFeatures);
                for (size_t i = 0; i < numFeatures; i++)
                {
                    const double gDiff = g[c][i] - gOld[c][i];
                    const double wDiff = w[c][i] - wOld[c][i];
                    u[c][i] = gDiff;
                    d[c][i] = wDiff;
                    alpha += gDiff * wDiff;
                    utu   += gDiff * gDiff;
                }
            }
            sigma = alpha / utu;
            memoryAlpha[nextMemPosition] = alpha;

            // update memory position
            nextMemPosition++;
            if (nextMemPosition == memorySize)
                nextMemPosition = 0;
            numMemPushes++;

            // q <- g
            for (size_t c = 0; c < numClasses; c++)
                memcpy(&q[c][0], &g[c][0], g[c].size() * sizeof(double));

            // determine memory evaluation order 1..M (M is the newest)
            vector<size_t> memOrder;
            if (numMemPushes <= memorySize)
            {
                for (size_t i = 0; i < numMemPushes; i++)
                    memOrder.push_back(i);
            }
            else
            {
                for (size_t i = 0; i < memorySize; i++)
                    memOrder.push_back((i + nextMemPosition) % memorySize);
            }

            vector<double> beta(memOrder.size(), 0.0);
            for (int i = memOrder.size() - 1; i >= 0; i--)
            {
                const size_t m = memOrder[static_cast<size_t>(i)];
                const double alpha = memoryAlpha[m];
                const vector< vector<double> >& dM = memoryD[m];
                double& betaM = beta[m];

                // compute beta[m] = (memory_d[m] dot g) / alpha[m]
                for (size_t c = 0; c < dM.size(); c++)
                    for (size_t i = 0; i < dM[c].size(); i++)
                        betaM += dM[c][i] * g[c][i];
                betaM /= alpha;

                // q <- q - beta[m]*memory_u[m]
                const vector< vector<double> >& uM = memoryU[m];
                for (size_t c = 0; c < q.size(); c++)
                    for (size_t i = 0; i < q[c].size(); i++)
                        q[c][i] -= betaM * uM[c][i];
            }

            // q <- sigma*q
            for (size_t c = 0; c < q.size(); c++)
                for (size_t i = 0; i < q[c].size(); i++)
                    q[c][i] *= sigma;

            for (size_t i = 0; i < memOrder.size(); i++)
            {
                const size_t m = memOrder[static_cast<size_t>(i)];
                const vector< vector<double> >& uM = memoryU[m];
                const vector< vector<double> >& dM = memoryD[m];
                const double betaM = beta[m];
                const double oneOverAlpha = 1.0 / memoryAlpha[m];
                double umq = computeDotProduct(uM, q);

                for (size_t c = 0; c < numClasses; c++)
                    for (size_t j = 0; j < q[c].size(); j++)
                    {
                        const double dq = dM[c][j] * (betaM - umq * oneOverAlpha);
                        umq += uM[c][j] * dq;
                        q[c][j] += dq;
                    }
            }

            // q <- -q
            for (size_t c = 0; c < numClasses; c++)
                for (size_t i = 0; i < q[c].size(); i++)
                    q[c][i] = -q[c][i];

            // qtx = q*X
            for (size_t i = 0; i < trainingIdxs.size(); i++)
            {
                const MlSample& sample = trainingDataSet->getSample(trainingIdxs[i]);
                for (size_t c = 0; c < numClasses; c++)
                    qtx[c][i] = computeDotProduct(q[c], sample.pairs);
            }

            bool needToRestart = false;
            eta = lineSearch(trainingDataSet, trainingIdxs, w, wtx, qtx, g, q, lambda);
            if (eta <= 0.0) // restart ?
                needToRestart = true;

            // update wOld <- w ; w <- w + eta*q ; gOld <- g
            for (size_t c = 0; c < numClasses; c++)
            {
                memcpy(&wOld[c][0], &w[c][0], sizeof(double) * w[c].size());
                memcpy(&gOld[c][0], &g[c][0], sizeof(double) * g[c].size());
                for (size_t i = 0; i < numFeatures; i++)
                    w[c][i] += eta * q[c][i];
            }

            for (size_t c = 0; c < numClasses; c++)
                for (size_t i = 0; i < numSamples; i++)
                    wtx[c][i] += eta * qtx[c][i];

            round++;
            totalRounds++;

            if (terminateTraining || needToRestart)
                break;
        }

        if (terminateTraining)
            break;
    }

    if (!params->getIndHadInternalError())
    {
        params->setIndNormalTermination(true);
    }
    else
        cout << "Warning: encountered mathematical error while training!" << endl;

    weights_ = bestW;

    cout << "W=" << endl;
    printVector(weights_);
    cout << endl;

    cout << "Terminated after " << totalRounds << " rounds (" << megaRound << " restarts)" << endl;
    cout << "Best training error " << fixed << setprecision(8) << bestTrainingError
         << " (round " << bestTrainingRound << ")" << endl;
    if (performTest)
        cout << "Best test error " << bestTestError << " (round " << bestTestRound << ")" << endl;

    indWasInitialized_ = true;
    //this->calcErrorRateWithPerplexity(trainingDataSet, trainingIdxs, true, NULL);
}
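/*
 * computeDotProduct() and MlSample::pairs are used above but not shown here.
 * Below is a plausible minimal sketch of the two overloads the code relies on,
 * under the assumption (not confirmed by the original source) that pairs holds
 * sparse (featureIndex, value) entries.
 */
#include <cstddef>
#include <vector>

struct IdxVal
{
    size_t index;  // feature index (assumed layout)
    double value;  // feature value
};

// Dense dot product over class X feature matrices, as in computeDotProduct(g, g).
double computeDotProduct(const std::vector< std::vector<double> >& a,
                         const std::vector< std::vector<double> >& b)
{
    double sum = 0.0;
    for (size_t c = 0; c < a.size(); c++)
        for (size_t i = 0; i < a[c].size(); i++)
            sum += a[c][i] * b[c][i];
    return sum;
}

// Sparse dot product of one class's weight vector with a sample's features,
// as in computeDotProduct(q[c], sample.pairs).
double computeDotProduct(const std::vector<double>& w,
                         const std::vector<IdxVal>& pairs)
{
    double sum = 0.0;
    for (size_t k = 0; k < pairs.size(); k++)
        sum += w[pairs[k].index] * pairs[k].value;
    return sum;
}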