bool ClassificationData::merge(const ClassificationData &otherData){ if( otherData.getNumDimensions() != numDimensions ){ errorLog << "merge(const ClassificationData &labelledData) - The number of dimensions in the labelledData (" << otherData.getNumDimensions() << ") does not match the number of dimensions of this dataset (" << numDimensions << ")" << std::endl; return false; } //The dataset has changed so flag that any previous cross validation setup will now not work crossValidationSetup = false; crossValidationIndexs.clear(); const UINT M = otherData.getNumSamples(); //Reserve the memory reserve( getNumSamples() + M ); //Add the data from the labelledData to this instance for(UINT i=0; i<M; i++){ addSample(otherData[i].getClassLabel(), otherData[i].getSample()); } //Set the class names from the dataset Vector< ClassTracker > classTracker = otherData.getClassTracker(); for(UINT i=0; i<classTracker.getSize(); i++){ setClassNameForCorrespondingClassLabel(classTracker[i].className, classTracker[i].classLabel); } //Sort the class labels sortClassLabels(); return true; }
/**
 Replaces the contents of this dataset with the labelled samples read from a CSV file.
 One column of each row holds the class label; every other column is treated as sample data,
 so the dimensionality of the dataset becomes the number of columns minus one.

 @param filename: the CSV file to parse
 @param classLabelColumnIndex: the index of the column that holds the class label; it must be smaller than the number of columns in the file
 @return true if the file was parsed, false if parsing failed, the column count is inconsistent or too small, or classLabelColumnIndex is out of range
*/
bool LabelledClassificationData::loadDatasetFromCSVFile(string filename,UINT classLabelColumnIndex){

    numDimensions = 0;
    datasetName = "NOT_SET";
    infoText = "";

    //Clear any previous data
    clear();

    //Parse the CSV file
    FileParser parser;

    if( !parser.parseCSVFile(filename,true) ){
        errorLog << "loadDatasetFromCSVFile(string filename) - Failed to parse CSV file!" << endl;
        return false;
    }

    if( !parser.getConsistentColumnSize() ){
        errorLog << "loadDatasetFromCSVFile(string filename) - The CSV file does not have a consistent number of columns!" << endl;
        return false;
    }

    if( parser.getColumnSize() <= 1 ){
        errorLog << "loadDatasetFromCSVFile(string filename) - The CSV file does not have enough columns! It should contain at least two columns!" << endl;
        return false;
    }

    //The label column must exist, otherwise every row would be indexed out of range below
    if( classLabelColumnIndex >= parser.getColumnSize() ){
        errorLog << "loadDatasetFromCSVFile(string filename) - The class label column index (" << classLabelColumnIndex << ") is out of range! The CSV file has " << parser.getColumnSize() << " columns!" << endl;
        return false;
    }

    //Every column except the label column is one dimension of the sample
    const UINT numColumns = parser.getColumnSize();
    numDimensions = numColumns-1;

    VectorDouble sample(numDimensions);
    for(UINT i=0; i<parser.getRowSize(); i++){

        //Get the class label
        const UINT classLabel = Util::stringToInt( parser[i][classLabelColumnIndex] );

        //Copy all the remaining columns, in order, into the sample
        UINT j = 0;
        for(UINT n=0; n<numColumns; n++){
            if( n == classLabelColumnIndex ) continue;
            sample[j++] = Util::stringToDouble( parser[i][n] );
        }

        //Add the labelled sample to the dataset, a rejected sample is reported but does not abort the load
        if( !addSample(classLabel, sample) ){
            warningLog << "loadDatasetFromCSVFile(string filename) - Could not add sample " << i << " to the dataset!" << endl;
        }
    }

    sortClassLabels();

    return true;
}
bool ClassificationData::addSample(const UINT classLabel,const VectorFloat &sample){ if( sample.getSize() != numDimensions ){ if( totalNumSamples == 0 ){ warningLog << "addSample(const UINT classLabel, VectorFloat &sample) - the size of the new sample (" << sample.getSize() << ") does not match the number of dimensions of the dataset (" << numDimensions << "), setting dimensionality to: " << numDimensions << std::endl; numDimensions = sample.getSize(); }else{ errorLog << "addSample(const UINT classLabel, VectorFloat &sample) - the size of the new sample (" << sample.getSize() << ") does not match the number of dimensions of the dataset (" << numDimensions << ")" << std::endl; return false; } } //The class label must be greater than zero (as zero is used for the null rejection class label if( classLabel == GRT_DEFAULT_NULL_CLASS_LABEL && !allowNullGestureClass ){ errorLog << "addSample(const UINT classLabel, VectorFloat &sample) - the class label can not be 0!" << std::endl; return false; } //The dataset has changed so flag that any previous cross validation setup will now not work crossValidationSetup = false; crossValidationIndexs.clear(); ClassificationSample newSample(classLabel,sample); data.push_back( newSample ); totalNumSamples++; if( classTracker.getSize() == 0 ){ ClassTracker tracker(classLabel,1); classTracker.push_back(tracker); }else{ bool labelFound = false; for(UINT i=0; i<classTracker.getSize(); i++){ if( classLabel == classTracker[i].classLabel ){ classTracker[i].counter++; labelFound = true; break; } } if( !labelFound ){ ClassTracker tracker(classLabel,1); classTracker.push_back(tracker); } } //Update the class labels sortClassLabels(); return true; }
bool ClassificationData::relabelAllSamplesWithClassLabel(const UINT oldClassLabel,const UINT newClassLabel){ bool oldClassLabelFound = false; bool newClassLabelAllReadyExists = false; UINT indexOfOldClassLabel = 0; UINT indexOfNewClassLabel = 0; //Find out how many training examples we need to relabel for(UINT i=0; i<classTracker.getSize(); i++){ if( classTracker[i].classLabel == oldClassLabel ){ indexOfOldClassLabel = i; oldClassLabelFound = true; } if( classTracker[i].classLabel == newClassLabel ){ indexOfNewClassLabel = i; newClassLabelAllReadyExists = true; } } //If the old class label was not found then we can't do anything if( !oldClassLabelFound ){ return false; } //Relabel the old class labels for(UINT i=0; i<totalNumSamples; i++){ if( data[i].getClassLabel() == oldClassLabel ){ data[i].setClassLabel(newClassLabel); } } //Update the class tracler if( newClassLabelAllReadyExists ){ //Add the old sample count to the new sample count classTracker[ indexOfNewClassLabel ].counter += classTracker[ indexOfOldClassLabel ].counter; }else{ //Create a new class tracker classTracker.push_back( ClassTracker(newClassLabel,classTracker[ indexOfOldClassLabel ].counter,classTracker[ indexOfOldClassLabel ].className) ); } //Erase the old class tracker classTracker.erase( classTracker.begin() + indexOfOldClassLabel ); //Sort the class labels sortClassLabels(); return true; }
bool ClassificationData::addClass(const UINT classLabel,const std::string className){ //Check to make sure the class label does not exist for(UINT i=0; i<classTracker.size(); i++){ if( classTracker[i].classLabel == classLabel ){ return false; } } //Add the class label to the class tracker classTracker.push_back( ClassTracker(classLabel,0,className) ); //Sort the class labels sortClassLabels(); return true; }
bool ClassificationData::addClass(const UINT classLabel,const std::string className){ //Check to make sure the class label does not exist for(size_t i=0; i<classTracker.getSize(); i++){ if( classTracker[i].classLabel == classLabel ){ warningLog << "addClass(const UINT classLabel,const std::string className) - Failed to add class, it already exists! Class label: " << classLabel << std::endl; return false; } } //Add the class label to the class tracker classTracker.push_back( ClassTracker(classLabel,0,className) ); //Sort the class labels sortClassLabels(); return true; }
/**
 Partitions the dataset into a training dataset (which is kept by this instance) and a
 testing/validation dataset (which is returned as a new instance). The samples are assigned
 at random; with stratified sampling the split is performed separately inside each class.

 @param trainingSizePercentage: the percentage of the data that remains in this instance; values above 100 are treated as 100
 @param useStratifiedSampling: if true, the percentage is applied to each class individually, otherwise to the dataset as a whole
 @return a new dataset holding the samples that were not kept as training data
*/
ClassificationData ClassificationData::split(const UINT trainingSizePercentage,const bool useStratifiedSampling){

    //The dataset has changed so flag that any previous cross validation setup will now not work
    crossValidationSetup = false;
    crossValidationIndexs.clear();

    //A percentage above 100 would ask for more training samples than exist, which would index
    //past the end of the shuffled index buffers below, so it is clamped here
    const UINT trainingPercentage = trainingSizePercentage > 100 ? 100 : trainingSizePercentage;

    ClassificationData trainingSet(numDimensions);
    ClassificationData testSet(numDimensions);
    trainingSet.setAllowNullGestureClass( allowNullGestureClass );
    testSet.setAllowNullGestureClass( allowNullGestureClass );

    //NOTE(review): std::random_shuffle was removed from the standard library in C++17,
    //it should be replaced with std::shuffle when the build moves to a newer standard

    if( useStratifiedSampling ){
        const UINT K = getNumClasses();

        //Break the data into separate classes, storing the sample indexs of each class
        Vector< Vector< UINT > > classData( K );
        for(UINT i=0; i<totalNumSamples; i++){
            classData[ getClassLabelIndexValue( data[i].getClassLabel() ) ].push_back( i );
        }

        //Randomize the order of the indexs in each of the class index buffers
        for(UINT k=0; k<K; k++){
            std::random_shuffle(classData[k].begin(), classData[k].end());
        }

        //Work out once, for each class, how many of its samples go to the training set
        Vector< UINT > numTrainingExamplesPerClass( K );
        UINT numTrainingSamples = 0;
        UINT numTestSamples = 0;
        for(UINT k=0; k<K; k++){
            const UINT classSize = (UINT)classData[k].getSize();
            numTrainingExamplesPerClass[k] = (UINT) floor( Float(classSize) / 100.0 * Float(trainingPercentage) );
            numTrainingSamples += numTrainingExamplesPerClass[k];
            numTestSamples += classSize - numTrainingExamplesPerClass[k];
        }

        //Reserve the memory
        trainingSet.reserve( numTrainingSamples );
        testSet.reserve( numTestSamples );

        //Loop over each class, the first part of its shuffled indexs is training data, the rest is test data
        for(UINT k=0; k<K; k++){
            const UINT classSize = (UINT)classData[k].getSize();
            const UINT numTrainingExamples = numTrainingExamplesPerClass[k];

            for(UINT i=0; i<numTrainingExamples; i++){
                trainingSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
            }
            for(UINT i=numTrainingExamples; i<classSize; i++){
                testSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
            }
        }
    }else{

        const UINT numTrainingExamples = (UINT) floor( Float(totalNumSamples) / 100.0 * Float(trainingPercentage) );

        //Create the random partition indexs
        Vector< UINT > indexs( totalNumSamples );
        for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
        std::random_shuffle(indexs.begin(), indexs.end());

        //Reserve the memory
        trainingSet.reserve( numTrainingExamples );
        testSet.reserve( totalNumSamples-numTrainingExamples );

        //The first part of the shuffled indexs is training data, the rest is test data
        for(UINT i=0; i<numTrainingExamples; i++){
            trainingSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
        }
        for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
            testSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
        }
    }

    //Overwrite the training data in this instance with the training data of the trainingSet
    //NOTE(review): trainingSet is built from numDimensions and the null gesture flag only, so this
    //assignment replaces everything else this instance held with trainingSet's state - confirm this is intended
    *this = trainingSet;

    //Sort the class labels in this dataset
    sortClassLabels();

    //Sort the class labels of the test dataset
    testSet.sortClassLabels();

    return testSet;
}
bool ClassificationData::loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex){ numDimensions = 0; datasetName = "NOT_SET"; infoText = ""; //Clear any previous data clear(); //Parse the CSV file FileParser parser; Timer timer; timer.start(); if( !parser.parseCSVFile(filename,true) ){ errorLog << "loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - Failed to parse CSV file!" << std::endl; return false; } if( !parser.getConsistentColumnSize() ){ errorLog << "loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndexe) - The CSV file does not have a consistent number of columns!" << std::endl; return false; } if( parser.getColumnSize() <= 1 ){ errorLog << "loadDatasetFromCSVFile(const std::string &filename,const UINT classLabelColumnIndex) - The CSV file does not have enough columns! It should contain at least two columns!" << std::endl; return false; } //Set the number of dimensions numDimensions = parser.getColumnSize()-1; timer.start(); //Reserve the memory for the data data.resize( parser.getRowSize(), ClassificationSample(numDimensions) ); timer.start(); //Loop over the samples and add them to the data set UINT classLabel = 0; UINT j = 0; UINT n = 0; totalNumSamples = parser.getRowSize(); for(UINT i=0; i<totalNumSamples; i++){ //Get the class label classLabel = grt_from_str< UINT >( parser[i][classLabelColumnIndex] ); //Set the class label data[i].setClassLabel( classLabel ); //Get the sample data j=0; n=0; while( j != numDimensions ){ if( n != classLabelColumnIndex ){ data[i][j++] = grt_from_str< Float >( parser[i][n] ); } n++; } //Update the class tracker if( classTracker.size() == 0 ){ ClassTracker tracker(classLabel,1); classTracker.push_back(tracker); }else{ bool labelFound = false; const size_t numClasses = classTracker.size(); for(size_t i=0; i<numClasses; i++){ if( classLabel == classTracker[i].classLabel ){ classTracker[i].counter++; labelFound = true; break; } } 
if( !labelFound ){ ClassTracker tracker(classLabel,1); classTracker.push_back(tracker); } } } //Sort the class labels sortClassLabels(); return true; }
bool ClassificationData::loadDatasetFromFile(const std::string &filename){ std::fstream file; file.open(filename.c_str(), std::ios::in); UINT numClasses = 0; clear(); if( !file.is_open() ){ errorLog << "loadDatasetFromFile(const std::string &filename) - could not open file!" << std::endl; return false; } std::string word; //Check to make sure this is a file with the Training File Format file >> word; if(word != "GRT_LABELLED_CLASSIFICATION_DATA_FILE_V1.0"){ errorLog << "loadDatasetFromFile(const std::string &filename) - could not find file header!" << std::endl; file.close(); return false; } //Get the name of the dataset file >> word; if(word != "DatasetName:"){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find DatasetName header!" << std::endl; errorLog << word << std::endl; file.close(); return false; } file >> datasetName; file >> word; if(word != "InfoText:"){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find InfoText header!" << std::endl; file.close(); return false; } //Load the info text file >> word; infoText = ""; while( word != "NumDimensions:" ){ infoText += word + " "; file >> word; } //Get the number of dimensions in the training data if( word != "NumDimensions:" ){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find NumDimensions header!" << std::endl; file.close(); return false; } file >> numDimensions; //Get the total number of training examples in the training data file >> word; if( word != "TotalNumTrainingExamples:" && word != "TotalNumExamples:" ){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find TotalNumTrainingExamples header!" << std::endl; file.close(); return false; } file >> totalNumSamples; //Get the total number of classes in the training data file >> word; if(word != "NumberOfClasses:"){ errorLog << "loadDatasetFromFile(string filename) - failed to find NumberOfClasses header!" 
<< std::endl; file.close(); return false; } file >> numClasses; //Resize the class counter buffer and load the counters classTracker.resize(numClasses); //Get the total number of classes in the training data file >> word; if(word != "ClassIDsAndCounters:"){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find ClassIDsAndCounters header!" << std::endl; file.close(); return false; } for(UINT i=0; i<classTracker.getSize(); i++){ file >> classTracker[i].classLabel; file >> classTracker[i].counter; file >> classTracker[i].className; } //Check if the dataset should be scaled using external ranges file >> word; if(word != "UseExternalRanges:"){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find UseExternalRanges header!" << std::endl; file.close(); return false; } file >> useExternalRanges; //If we are using external ranges then load them if( useExternalRanges ){ externalRanges.resize(numDimensions); for(UINT i=0; i<externalRanges.getSize(); i++){ file >> externalRanges[i].minValue; file >> externalRanges[i].maxValue; } } //Get the main training data file >> word; if( word != "LabelledTrainingData:" && word != "Data:"){ errorLog << "loadDatasetFromFile(const std::string &filename) - failed to find LabelledTrainingData header!" << std::endl; file.close(); return false; } ClassificationSample tempSample( numDimensions ); data.resize( totalNumSamples, tempSample ); for(UINT i=0; i<totalNumSamples; i++){ UINT classLabel = 0; VectorFloat sample(numDimensions,0); file >> classLabel; for(UINT j=0; j<numDimensions; j++){ file >> sample[j]; } data[i].set(classLabel, sample); } file.close(); //Sort the class labels sortClassLabels(); return true; }
/**
 Partitions the dataset into a training dataset (which is kept by this instance) and a
 testing/validation dataset (which is returned as a new instance). The samples are assigned
 at random; with stratified sampling the split is performed separately inside each class.

 @param trainingSizePercentage: the percentage of the data that remains in this instance; values above 100 are treated as 100
 @param useStratifiedSampling: if true, the percentage is applied to each class individually, otherwise to the dataset as a whole
 @return a new dataset holding the samples that were not kept as training data
*/
LabelledClassificationData LabelledClassificationData::partition(UINT trainingSizePercentage,bool useStratifiedSampling){

    //The dataset has changed so flag that any previous cross validation setup will now not work
    crossValidationSetup = false;
    crossValidationIndexs.clear();

    //A percentage above 100 would ask for more training samples than exist, which would index
    //past the end of the shuffled index buffers below, so it is clamped here
    if( trainingSizePercentage > 100 ){
        trainingSizePercentage = 100;
    }

    LabelledClassificationData trainingSet(numDimensions);
    LabelledClassificationData testSet(numDimensions);
    trainingSet.setAllowNullGestureClass( allowNullGestureClass );
    testSet.setAllowNullGestureClass( allowNullGestureClass );

    //A single random generator is shared by both sampling modes
    //NOTE(review): the shuffles below assume getRandomNumberInt(min,max) never returns max,
    //otherwise the swap would index one past the end of the buffer - confirm against Random
    Random random;
    UINT randomIndex = 0;

    if( useStratifiedSampling ){
        const UINT numClasses = getNumClasses();

        //Break the data into separate classes, storing the sample indexs of each class
        vector< vector< UINT > > classData( numClasses );
        for(UINT i=0; i<totalNumSamples; i++){
            classData[ getClassLabelIndexValue( data[i].getClassLabel() ) ].push_back( i );
        }

        //Randomize the order of the indexs in each of the class index buffers
        for(UINT k=0; k<numClasses; k++){
            const UINT numSamples = (UINT)classData[k].size();
            for(UINT x=0; x<numSamples; x++){
                //Pick a random index and swap it with the current position
                randomIndex = random.getRandomNumberInt(0,numSamples);
                SWAP(classData[k][ x ], classData[k][ randomIndex ]);
            }
        }

        //Loop over each class, the first part of its shuffled indexs is training data, the rest is test data
        for(UINT k=0; k<numClasses; k++){
            const UINT numSamples = (UINT)classData[k].size();
            const UINT numTrainingExamples = (UINT) floor( double(numSamples) / 100.0 * double(trainingSizePercentage) );

            for(UINT i=0; i<numTrainingExamples; i++){
                trainingSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
            }
            for(UINT i=numTrainingExamples; i<numSamples; i++){
                testSet.addSample( data[ classData[k][i] ].getClassLabel(), data[ classData[k][i] ].getSample() );
            }
        }
    }else{

        const UINT numTrainingExamples = (UINT) floor( double(totalNumSamples) / 100.0 * double(trainingSizePercentage) );

        //Create the random partition indexs, the buffer is only needed by this sampling mode
        vector< UINT > indexs( totalNumSamples );
        for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
        for(UINT x=0; x<totalNumSamples; x++){
            //Pick a random index and swap it with the current position
            randomIndex = random.getRandomNumberInt(0,totalNumSamples);
            SWAP(indexs[ x ],indexs[ randomIndex ]);
        }

        //The first part of the shuffled indexs is training data, the rest is test data
        for(UINT i=0; i<numTrainingExamples; i++){
            trainingSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
        }
        for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
            testSet.addSample( data[ indexs[i] ].getClassLabel(), data[ indexs[i] ].getSample() );
        }
    }

    //Overwrite the training data in this instance with the training data of the trainingSet
    *this = trainingSet;

    //Sort the class labels of both datasets
    sortClassLabels();
    testSet.sortClassLabels();

    return testSet;
}