bool Softmax::trainSoftmaxModel(UINT classLabel,SoftmaxModel &model,ClassificationData &data){

    Float error = 0;
    Float errorSum = 0;
    Float lastErrorSum = 0;
    Float delta = 0;
    const UINT N = data.getNumDimensions();
    const UINT M = data.getNumSamples();
    UINT iter = 0;
    bool keepTraining = true;
    Random random;
    VectorFloat y(M);
    VectorFloat batchMean(N);
    Vector< UINT > randomTrainingOrder(M);
    Vector< VectorFloat > batchData(batchSize,VectorFloat(N));

    //Init the model
    model.init( classLabel, N );

    //Setup the target vector, the input data is relabelled as positive samples (with label 1.0) and negative samples (with label 0.0)
    for(UINT i=0; i<M; i++){
        y[i] = data[i].getClassLabel()==classLabel ? 1.0 : 0.0;
    }

    //In most cases the training data is grouped into classes (100 samples for class 1, followed by 100 samples for class 2, etc.)
    //This can cause a problem for the stochastic gradient descent algorithm. To avoid this issue, we randomly shuffle the order of the
    //training samples. This random order is then used at each epoch.
    for(UINT i=0; i<M; i++){
        randomTrainingOrder[i] = i;
    }
    std::random_shuffle(randomTrainingOrder.begin(), randomTrainingOrder.end());

    //Clear any previous training results
    trainingResults.clear();
    trainingResults.reserve( maxNumEpochs );
    TrainingResult epochResult;

    //Run the main stochastic gradient descent training algorithm
    while( keepTraining ){

        //Run one epoch of training using stochastic gradient descent
        errorSum = 0;
        UINT m = 0;
        while( m < M ){

            //Get the batch data for this update
            UINT roundSize = m+batchSize < M ? batchSize : M-m;
            batchMean.fill(0.0);
            for(UINT i=0; i<roundSize; i++){
                for(UINT j=0; j<N; j++){
                    batchData[i][j] = data[ randomTrainingOrder[m+i] ][j];
                    batchMean[j] += batchData[i][j];
                }
            }
            for(UINT j=0; j<N; j++) batchMean[j] /= roundSize;

            //Compute the error on this batch, given the current weights
            error = 0.0;
            for(UINT i=0; i<roundSize; i++){
                error += y[ randomTrainingOrder[m+i] ] - model.compute( batchData[i] );
            }
            error /= roundSize;
            errorSum += error;

            //Update the weights
            for(UINT j=0; j<N; j++){
                model.w[j] += learningRate * error * batchMean[j];
            }
            model.w0 += learningRate * error;

            m += roundSize;
        }

        //Compute the change in error
        delta = fabs( errorSum-lastErrorSum );
        lastErrorSum = errorSum;

        //Check to see if we should stop
        if( delta <= minChange ){
            keepTraining = false;
        }

        if( ++iter >= maxNumEpochs ){
            keepTraining = false;
        }

        trainingLog << "Class: " << classLabel << " Epoch: " << iter << " TotalError: " << errorSum << " Delta: " << delta << std::endl;

        epochResult.setClassificationResult( iter, errorSum, this );
        trainingResults.push_back( epochResult );
    }

    return true;
}
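//--------------------------------------------------------------------------------
//Usage sketch (illustrative, not part of the library). A minimal example of how
//the one-vs-all trainer above might be driven through the public Softmax API,
//assuming Softmax::train_ calls trainSoftmaxModel once per class label. The
//dataset filename and parameter values are made up for the example.
//--------------------------------------------------------------------------------
bool exampleTrainSoftmax(){
    ClassificationData data;
    if( !data.load( "training-data.grt" ) ) return false; //Hypothetical dataset file

    Softmax softmax;
    softmax.setLearningRate( 0.01 );  //Step size for the SGD weight updates
    softmax.setMinChange( 1.0e-5 );   //Stop when the change in error falls below this
    softmax.setMaxNumEpochs( 500 );   //Hard limit on the number of SGD epochs

    return softmax.train( data );
}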
bool LogisticRegression::train(LabelledRegressionData trainingData){

    const unsigned int M = trainingData.getNumSamples();
    const unsigned int N = trainingData.getNumInputDimensions();
    const unsigned int K = trainingData.getNumTargetDimensions();
    trained = false;
    trainingResults.clear();

    if( M == 0 ){
        errorLog << "train(LabelledRegressionData trainingData) - Training data has zero samples!" << endl;
        return false;
    }

    if( K != 1 ){
        errorLog << "train(LabelledRegressionData trainingData) - The number of target dimensions is not 1!" << endl;
        return false;
    }

    numInputDimensions = N;
    numOutputDimensions = 1; //Logistic Regression will have 1 output
    inputVectorRanges.clear();
    targetVectorRanges.clear();

    //Scale the training and validation data, if needed
    if( useScaling ){
        //Find the ranges for the input data
        inputVectorRanges = trainingData.getInputRanges();

        //Find the ranges for the target data
        targetVectorRanges = trainingData.getTargetRanges();

        //Scale the training data
        trainingData.scale(inputVectorRanges,targetVectorRanges,0.0,1.0);
    }

    //Reset the weights
    Random rand;
    w0 = rand.getRandomNumberUniform(-0.1,0.1);
    w.resize(N);
    for(UINT j=0; j<N; j++){
        w[j] = rand.getRandomNumberUniform(-0.1,0.1);
    }

    double error = 0;
    double lastSquaredError = 0;
    double delta = 0;
    UINT iter = 0;
    bool keepTraining = true;
    Random random;
    vector< UINT > randomTrainingOrder(M);
    TrainingResult result;
    trainingResults.reserve(maxNumEpochs);

    //In most cases the training data is grouped into classes (100 samples for class 1, followed by 100 samples for class 2, etc.)
    //This can cause a problem for the stochastic gradient descent algorithm. To avoid this issue, we randomly shuffle the order of the
    //training samples. This random order is then used at each epoch.
    for(UINT i=0; i<M; i++){
        randomTrainingOrder[i] = i;
    }
    std::random_shuffle(randomTrainingOrder.begin(), randomTrainingOrder.end());

    //Run the main stochastic gradient descent training algorithm
    while( keepTraining ){

        //Run one epoch of training using stochastic gradient descent
        totalSquaredTrainingError = 0;
        for(UINT m=0; m<M; m++){

            //Select the random sample
            UINT i = randomTrainingOrder[m];

            //Compute the error, given the current weights
            VectorDouble x = trainingData[i].getInputVector();
            VectorDouble y = trainingData[i].getTargetVector();
            double h = w0;
            for(UINT j=0; j<N; j++){
                h += x[j] * w[j];
            }
            error = y[0] - sigmoid( h );
            totalSquaredTrainingError += SQR(error);

            //Update the weights
            for(UINT j=0; j<N; j++){
                w[j] += learningRate * error * x[j];
            }
            w0 += learningRate * error;
        }

        //Compute the change in error
        delta = fabs( totalSquaredTrainingError-lastSquaredError );
        lastSquaredError = totalSquaredTrainingError;

        //Check to see if we should stop
        if( delta <= minChange ){
            keepTraining = false;
        }

        if( ++iter >= maxNumEpochs ){
            keepTraining = false;
        }

        if( isinf( totalSquaredTrainingError ) || isnan( totalSquaredTrainingError ) ){
            errorLog << "train(LabelledRegressionData &trainingData) - Training failed! Total squared error is NAN. If scaling is not enabled then you should try to scale your data and see if this solves the issue." << endl;
            return false;
        }

        //Store the training results
        rootMeanSquaredTrainingError = sqrt( totalSquaredTrainingError / double(M) );
        result.setRegressionResult(iter,totalSquaredTrainingError,rootMeanSquaredTrainingError);
        trainingResults.push_back( result );

        //Notify any observers of the new training result
        trainingResultsObserverManager.notifyObservers( result );

        trainingLog << "Epoch: " << iter << " SSE: " << totalSquaredTrainingError << " Delta: " << delta << endl;
    }

    //Flag that the algorithm has been trained
    regressionData.resize(1,0);
    trained = true;
    return trained;
}
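//--------------------------------------------------------------------------------
//Usage sketch (illustrative, not part of the library). Shows how the trainer above
//might be called via the older GRT API this snippet is written against
//(LabelledRegressionData, VectorDouble). The toy samples and parameter values are
//made up; a real dataset would normally be loaded from file.
//--------------------------------------------------------------------------------
bool exampleTrainLogisticRegression(){
    LabelledRegressionData data;
    data.setInputAndTargetDimensions( 2, 1 ); //2-D input, 1-D target

    VectorDouble x(2), y(1);
    x[0] = 0.1; x[1] = 0.9; y[0] = 1.0; data.addSample( x, y );
    x[0] = 0.8; x[1] = 0.2; y[0] = 0.0; data.addSample( x, y );

    LogisticRegression regression;
    regression.enableScaling( true ); //Scale to [0 1], which also guards against the NAN failure above
    regression.setLearningRate( 0.01 );
    regression.setMinChange( 1.0e-8 );
    regression.setMaxNumEpochs( 500 );

    return regression.train( data );
}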
bool AdaBoost::train_(ClassificationData &trainingData){

    //Clear any previous model
    clear();

    if( trainingData.getNumSamples() <= 1 ){
        errorLog << "train_(ClassificationData &trainingData) - There are not enough training samples to train a model! Number of samples: " << trainingData.getNumSamples() << endl;
        return false;
    }

    numInputDimensions = trainingData.getNumDimensions();
    numClasses = trainingData.getNumClasses();
    const UINT M = trainingData.getNumSamples();
    const UINT POSITIVE_LABEL = WEAK_CLASSIFIER_POSITIVE_CLASS_LABEL;
    const UINT NEGATIVE_LABEL = WEAK_CLASSIFIER_NEGATIVE_CLASS_LABEL;
    double alpha = 0;
    const double beta = 0.001;
    double epsilon = 0;
    TrainingResult trainingResult;

    const UINT K = (UINT)weakClassifiers.size();
    if( K == 0 ){
        errorLog << "train_(ClassificationData &trainingData) - No weakClassifiers have been set. You need to set at least one weak classifier first." << endl;
        return false;
    }

    classLabels.resize(numClasses);
    models.resize(numClasses);
    ranges = trainingData.getRanges();

    //Scale the training data if needed
    if( useScaling ){
        trainingData.scale(ranges,0,1);
    }

    //Create the weights vector
    VectorDouble weights(M);

    //Create the error matrix
    MatrixDouble errorMatrix(K,M);

    for(UINT classIter=0; classIter<numClasses; classIter++){

        //Get the class label for the current class
        classLabels[classIter] = trainingData.getClassLabels()[classIter];

        //Set the class label of the current model
        models[ classIter ].setClassLabel( classLabels[classIter] );

        //Setup the labels for this class, POSITIVE_LABEL == 1, NEGATIVE_LABEL == 2
        ClassificationData classData;
        classData.setNumDimensions(trainingData.getNumDimensions());
        for(UINT i=0; i<M; i++){
            UINT label = trainingData[i].getClassLabel()==classLabels[classIter] ? POSITIVE_LABEL : NEGATIVE_LABEL;
            VectorDouble trainingSample = trainingData[i].getSample();
            classData.addSample(label,trainingSample);
        }

        //Setup the initial training sample weights
        std::fill(weights.begin(),weights.end(),1.0/M);

        //Run the boosting loop
        bool keepBoosting = true;
        UINT t = 0;

        while( keepBoosting ){

            //Pick the classifier from the family of classifiers that minimizes the total error
            UINT bestClassifierIndex = 0;
            double minError = numeric_limits<double>::max();
            for(UINT k=0; k<K; k++){

                //Get the k'th possible classifier
                WeakClassifier *weakLearner = weakClassifiers[k];

                //Train the current classifier
                if( !weakLearner->train(classData,weights) ){
                    errorLog << "Failed to train weakLearner!" << endl;
                    return false;
                }

                //Compute the weighted error for this classifier
                double e = 0;
                double positiveLabel = weakLearner->getPositiveClassLabel();
                double numCorrect = 0;
                double numIncorrect = 0;
                for(UINT i=0; i<M; i++){
                    //Only penalize errors
                    double prediction = weakLearner->predict( classData[i].getSample() );
                    if( (prediction == positiveLabel && classData[i].getClassLabel() != POSITIVE_LABEL) ||  //False positive
                        (prediction != positiveLabel && classData[i].getClassLabel() == POSITIVE_LABEL) ){  //False negative
                        e += weights[i];       //Increase the error proportional to the weight of the example
                        errorMatrix[k][i] = 1; //Flag that there was an error
                        numIncorrect++;
                    }else{
                        errorMatrix[k][i] = 0; //Flag that there was no error
                        numCorrect++;
                    }
                }

                trainingLog << "PositiveClass: " << classLabels[classIter] << " Boosting Iter: " << t << " Classifier: " << k << " WeightedError: " << e << " NumCorrect: " << numCorrect/M << " NumIncorrect: " << numIncorrect/M << endl;

                if( e < minError ){
                    minError = e;
                    bestClassifierIndex = k;
                }
            }

            epsilon = minError;

            //Set alpha, using the M1 weight value, small weights (close to 0) will receive a strong weight in the final classifier
            alpha = 0.5 * log( (1.0-epsilon)/epsilon );

            trainingLog << "PositiveClass: " << classLabels[classIter] << " Boosting Iter: " << t << " Best Classifier Index: " << bestClassifierIndex << " MinError: " << minError << " Alpha: " << alpha << endl;

            if( isinf(alpha) ){ keepBoosting = false; trainingLog << "Alpha is INF. Stopping boosting for current class" << endl; }
            if( 0.5 - epsilon <= beta ){ keepBoosting = false; trainingLog << "Epsilon <= Beta. Stopping boosting for current class" << endl; }
            if( ++t >= numBoostingIterations ) keepBoosting = false;

            trainingResult.setClassificationResult(t, minError, this);
            trainingResults.push_back(trainingResult);
            trainingResultsObserverManager.notifyObservers( trainingResult );

            if( keepBoosting ){

                //Add the best weak classifier to the committee
                models[ classIter ].addClassifierToCommitee( weakClassifiers[bestClassifierIndex], alpha );

                //Update the weights for the next boosting iteration
                double reWeight = (1.0 - epsilon) / epsilon;
                double oldSum = 0;
                double newSum = 0;
                for(UINT i=0; i<M; i++){
                    oldSum += weights[i];
                    //Only update the weights that resulted in an incorrect prediction
                    if( errorMatrix[bestClassifierIndex][i] == 1 ) weights[i] *= reWeight;
                    newSum += weights[i];
                }

                //Normalize all the weights
                //This results in increasing the weights of the samples that were incorrectly labelled
                //while decreasing the weights of the samples that were correctly classified
                reWeight = oldSum/newSum;
                for(UINT i=0; i<M; i++){
                    weights[i] *= reWeight;
                }

            }else{
                trainingLog << "Stopping boosting training at iteration : " << t-1 << " with an error of " << epsilon << endl;
                if( t-1 == 0 ){
                    //Add the best weak classifier to the committee (we have to add it as this is the first iteration)
                    if( isinf(alpha) ){ alpha = 1; } //If alpha is infinite then the first classifier got everything correct
                    models[ classIter ].addClassifierToCommitee( weakClassifiers[bestClassifierIndex], alpha );
                }
            }
        }
    }

    //Normalize the weights
    for(UINT k=0; k<numClasses; k++){
        models[k].normalizeWeights();
    }

    //Flag that the model has been trained
    trained = true;

    //Setup the data for prediction
    predictedClassLabel = 0;
    maxLikelihood = 0;
    classLikelihoods.resize(numClasses);
    classDistances.resize(numClasses);

    return true;
}
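//--------------------------------------------------------------------------------
//Usage sketch (illustrative, not part of the library). The boosting loop above
//requires at least one weak classifier to be set first; this sketch assumes the
//DecisionStump weak classifier that ships with GRT. Parameter values are made up.
//--------------------------------------------------------------------------------
bool exampleTrainAdaBoost( ClassificationData &data ){
    AdaBoost adaBoost;
    adaBoost.setWeakClassifier( DecisionStump() ); //Replaces any existing weak classifiers
    adaBoost.setNumBoostingIterations( 20 );       //Upper bound on t in the boosting loop
    return adaBoost.train( data );
}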
bool BernoulliRBM::train_(MatrixFloat &data){

    const UINT numTrainingSamples = data.getNumRows();
    numInputDimensions = data.getNumCols();
    numOutputDimensions = numHiddenUnits;
    numVisibleUnits = numInputDimensions;

    trainingLog << "NumInputDimensions: " << numInputDimensions << std::endl;
    trainingLog << "NumOutputDimensions: " << numOutputDimensions << std::endl;

    if( randomizeWeightsForTraining ){

        //Init the weights matrix
        weightsMatrix.resize(numHiddenUnits, numVisibleUnits);

        Float a = 1.0 / numVisibleUnits;
        for(UINT i=0; i<numHiddenUnits; i++) {
            for(UINT j=0; j<numVisibleUnits; j++) {
                weightsMatrix[i][j] = rand.getRandomNumberUniform(-a, a);
            }
        }

        //Init the bias units
        visibleLayerBias.resize( numVisibleUnits );
        hiddenLayerBias.resize( numHiddenUnits );
        std::fill(visibleLayerBias.begin(),visibleLayerBias.end(),0);
        std::fill(hiddenLayerBias.begin(),hiddenLayerBias.end(),0);

    }else{
        if( weightsMatrix.getNumRows() != numHiddenUnits ){
            errorLog << "train_(MatrixFloat &data) - Weights matrix row size does not match the number of hidden units!" << std::endl;
            return false;
        }
        if( weightsMatrix.getNumCols() != numVisibleUnits ){
            errorLog << "train_(MatrixFloat &data) - Weights matrix column size does not match the number of visible units!" << std::endl;
            return false;
        }
        if( visibleLayerBias.size() != numVisibleUnits ){
            errorLog << "train_(MatrixFloat &data) - Visible layer bias size does not match the number of visible units!" << std::endl;
            return false;
        }
        if( hiddenLayerBias.size() != numHiddenUnits ){
            errorLog << "train_(MatrixFloat &data) - Hidden layer bias size does not match the number of hidden units!" << std::endl;
            return false;
        }
    }

    //Flag that the model has been trained, in case the user wants to save the model during a training iteration using an observer
    trained = true;

    //Make sure the data is scaled between [0 1]
    ranges = data.getRanges();
    if( useScaling ){
        for(UINT i=0; i<numTrainingSamples; i++){
            for(UINT j=0; j<numInputDimensions; j++){
                data[i][j] = grt_scale(data[i][j], ranges[j].minValue, ranges[j].maxValue, 0.0, 1.0);
            }
        }
    }

    const UINT numBatches = static_cast<UINT>( ceil( Float(numTrainingSamples)/batchSize ) );

    //Setup the batch indexes
    Vector< BatchIndexs > batchIndexs( numBatches );
    UINT startIndex = 0;
    for(UINT i=0; i<numBatches; i++){
        batchIndexs[i].startIndex = startIndex;
        batchIndexs[i].endIndex = startIndex + batchSize;

        //Make sure the last batch end index is not larger than the number of training examples
        if( batchIndexs[i].endIndex >= numTrainingSamples ){
            batchIndexs[i].endIndex = numTrainingSamples;
        }

        //Get the batch size
        batchIndexs[i].batchSize = batchIndexs[i].endIndex - batchIndexs[i].startIndex;

        //Set the start index for the next batch
        startIndex = batchIndexs[i].endIndex;
    }

    Timer timer;
    UINT i,j,n,epoch,noChangeCounter = 0;
    Float startTime = 0;
    Float alpha = learningRate;
    Float error = 0;
    Float err = 0;
    Float delta = 0;
    Float lastError = 0;
    Vector< UINT > indexList(numTrainingSamples);
    TrainingResult trainingResult;
    MatrixFloat wT( numVisibleUnits, numHiddenUnits );       //Stores a transposed copy of the weights matrix
    MatrixFloat vW( numHiddenUnits, numVisibleUnits );       //Stores the weight velocity updates
    MatrixFloat tmpW( numHiddenUnits, numVisibleUnits );     //Stores the weight values that will be used to update the main weights matrix at each batch update
    MatrixFloat v1( batchSize, numVisibleUnits );            //Stores the real batch data during a batch update
    MatrixFloat v2( batchSize, numVisibleUnits );            //Stores the sampled batch data during a batch update
    MatrixFloat h1( batchSize, numHiddenUnits );             //Stores the hidden states given v1 and the current weightsMatrix
    MatrixFloat h2( batchSize, numHiddenUnits );             //Stores the sampled hidden states given v2 and the current weightsMatrix
    MatrixFloat c1( numHiddenUnits, numVisibleUnits );       //Stores h1' * v1
    MatrixFloat c2( numHiddenUnits, numVisibleUnits );       //Stores h2' * v2
    MatrixFloat vDiff( batchSize, numVisibleUnits );         //Stores the difference between v1-v2
    MatrixFloat hDiff( batchSize, numHiddenUnits );          //Stores the difference between h1-h2
    MatrixFloat cDiff( numHiddenUnits, numVisibleUnits );    //Stores the difference between c1-c2
    VectorFloat vDiffSum( numVisibleUnits );                 //Stores the column sum of vDiff
    VectorFloat hDiffSum( numHiddenUnits );                  //Stores the column sum of hDiff
    VectorFloat visibleLayerBiasVelocity( numVisibleUnits ); //Stores the velocity update of the visibleLayerBias
    VectorFloat hiddenLayerBiasVelocity( numHiddenUnits );   //Stores the velocity update of the hiddenLayerBias

    //Set all the velocity weights to zero
    vW.setAllValues( 0 );
    std::fill(visibleLayerBiasVelocity.begin(),visibleLayerBiasVelocity.end(),0);
    std::fill(hiddenLayerBiasVelocity.begin(),hiddenLayerBiasVelocity.end(),0);

    //Randomize the order that the training samples will be used in
    for(UINT i=0; i<numTrainingSamples; i++) indexList[i] = i;
    if( randomiseTrainingOrder ){
        std::random_shuffle(indexList.begin(), indexList.end());
    }

    //Start the main training loop
    timer.start();
    for(epoch=0; epoch<maxNumEpochs; epoch++) {
        startTime = timer.getMilliSeconds();
        error = 0;

        //Randomize the batch order
        std::random_shuffle(batchIndexs.begin(),batchIndexs.end());

        //Run each of the batch updates
        for(UINT k=0; k<numBatches; k+=batchStepSize){

            //Resize the data matrices, the matrices will only be resized if the rows/cols are different
            v1.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h1.resize( batchIndexs[k].batchSize, numHiddenUnits );
            v2.resize( batchIndexs[k].batchSize, numVisibleUnits );
            h2.resize( batchIndexs[k].batchSize, numHiddenUnits );

            //Setup the data pointers, using data pointers saves a few ms on large matrix updates
            Float **w_p = weightsMatrix.getDataPointer();
            Float **wT_p = wT.getDataPointer();
            Float **vW_p = vW.getDataPointer();
            Float **data_p = data.getDataPointer();
            Float **v1_p = v1.getDataPointer();
            Float **v2_p = v2.getDataPointer();
            Float **h1_p = h1.getDataPointer();
            Float **h2_p = h2.getDataPointer();
            Float *vlb_p = &visibleLayerBias[0];
            Float *hlb_p = &hiddenLayerBias[0];

            //Get the batch data
            UINT index = 0;
            for(i=batchIndexs[k].startIndex; i<batchIndexs[k].endIndex; i++){
                for(j=0; j<numVisibleUnits; j++){
                    v1_p[index][j] = data_p[ indexList[i] ][j];
                }
                index++;
            }

            //Copy a transposed version of the weights matrix, this is used to compute h1 and h2
            for(i=0; i<numHiddenUnits; i++)
                for(j=0; j<numVisibleUnits; j++)
                    wT_p[j][i] = w_p[i][j];

            //Compute h1
            h1.multiple(v1, wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h1_p[n][i] = sigmoidRandom( h1_p[n][i] + hlb_p[i] );
                }
            }

            //Compute v2
            v2.multiple(h1, weightsMatrix);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numVisibleUnits; i++){
                    v2_p[n][i] = sigmoidRandom( v2_p[n][i] + vlb_p[i] );
                }
            }

            //Compute h2
            h2.multiple(v2,wT);
            for(n=0; n<batchIndexs[k].batchSize; n++){
                for(i=0; i<numHiddenUnits; i++){
                    h2_p[n][i] = grt_sigmoid( h2_p[n][i] + hlb_p[i] );
                }
            }

            //Compute c1, c2 and the difference between v1-v2
            c1.multiple(h1,v1,true);
            c2.multiple(h2,v2,true);
            vDiff.subtract(v1, v2);

            //Compute the column sum of vDiff
            for(j=0; j<numVisibleUnits; j++){
                vDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    vDiffSum[j] += vDiff[i][j];
                }
            }

            //Compute the difference between h1 and h2
            hDiff.subtract(h1, h2);
            for(j=0; j<numHiddenUnits; j++){
                hDiffSum[j] = 0;
                for(i=0; i<batchIndexs[k].batchSize; i++){
                    hDiffSum[j] += hDiff[i][j];
                }
            }

            //Compute the difference between c1 and c2
            cDiff.subtract(c1,c2);

            //Update the weight velocities
            for(i=0; i<numHiddenUnits; i++){
                for(j=0; j<numVisibleUnits; j++){
                    vW_p[i][j] = ((momentum * vW_p[i][j]) + (alpha * cDiff[i][j])) / batchIndexs[k].batchSize;
                }
            }
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBiasVelocity[i] = ((momentum * visibleLayerBiasVelocity[i]) + (alpha * vDiffSum[i])) / batchIndexs[k].batchSize;
            }
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBiasVelocity[i] = ((momentum * hiddenLayerBiasVelocity[i]) + (alpha * hDiffSum[i])) / batchIndexs[k].batchSize;
            }

            //Update the weights
            weightsMatrix.add( vW );

            //Update the bias for the visible layer
            for(i=0; i<numVisibleUnits; i++){
                visibleLayerBias[i] += visibleLayerBiasVelocity[i];
            }

            //Update the bias for the hidden layer
            for(i=0; i<numHiddenUnits; i++){
                hiddenLayerBias[i] += hiddenLayerBiasVelocity[i];
            }

            //Compute the reconstruction error
            err = 0;
            for(i=0; i<batchIndexs[k].batchSize; i++){
                for(j=0; j<numVisibleUnits; j++){
                    err += SQR( v1[i][j] - v2[i][j] );
                }
            }
            error += err / batchIndexs[k].batchSize;
        }
        error /= numBatches;
        delta = lastError - error;
        lastError = error;

        trainingLog << "Epoch: " << epoch+1 << "/" << maxNumEpochs;
        trainingLog << " Epoch time: " << (timer.getMilliSeconds()-startTime)/1000.0 << " seconds";
        trainingLog << " Learning rate: " << alpha;
        trainingLog << " Momentum: " << momentum;
        trainingLog << " Average reconstruction error: " << error;
        trainingLog << " Delta: " << delta << std::endl;

        //Update the learning rate
        alpha *= learningRateUpdate;

        trainingResult.setClassificationResult(epoch, error, this);
        trainingResults.push_back(trainingResult);
        trainingResultsObserverManager.notifyObservers( trainingResult );

        //Check for convergence
        if( fabs(delta) < minChange ){
            if( ++noChangeCounter >= minNumEpochs ){
                trainingLog << "Stopping training. MinChange limit reached!" << std::endl;
                break;
            }
        }else noChangeCounter = 0;
    }

    trainingLog << "Training complete after " << epoch << " epochs. Total training time: " << timer.getMilliSeconds()/1000.0 << " seconds" << std::endl;

    trained = true;

    return true;
}
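//--------------------------------------------------------------------------------
//Usage sketch (illustrative, not part of the library). Shows how the CD-1
//(one-step contrastive divergence) trainer above might be configured. The unit
//counts and parameter values are made up, and the setter names are assumed from
//the member variables used in train_.
//--------------------------------------------------------------------------------
bool exampleTrainRBM( MatrixFloat &data ){
    BernoulliRBM rbm;
    rbm.setNumHiddenUnits( 64 ); //Number of hidden units, i.e. the output dimensionality
    rbm.setBatchSize( 50 );      //Mini-batch size for each weight update
    rbm.setLearningRate( 0.1 );  //Initial learning rate, decayed by learningRateUpdate each epoch
    rbm.setMaxNumEpochs( 100 );
    return rbm.train_( data );   //Note: train_ scales data in place when scaling is enabled
}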