/**
 * @brief Builds a newly allocated, encoded copy of a single example.
 *
 * The class, predicted class and training weight are copied from 'src'.  Each of the
 * 'numOfFeatures' selected source features (srcFeatureNums) is then written to its
 * destination slot (destFeatureNums) according to destWhatToDo:
 *   - FeAsIs   : the value is copied unchanged.
 *   - FeBinary : cardinalityDest consecutive slots are written; a slot receives 1.0 when
 *                its offset equals the feature value truncated to an integer, else 0.0.
 *   - FeScale  : the value is divided by cardinalityDest.
 *
 * @param[in] encodedFileDesc  Not referenced by this implementation.
 * @param[in] src              Example to encode; must not be NULL.
 * @return Instance created with 'new' holding 'numEncodedFeatures' features; presumably the
 *         caller takes ownership - confirm against callers.
 */
FeatureVectorPtr  FeatureEncoder::EncodeAExample (FileDescConstPtr  encodedFileDesc,
                                                  FeatureVectorPtr  src
                                                 )
{
  FeatureVectorPtr  result = new FeatureVector (numEncodedFeatures);

  result->MLClass        (src->MLClass        ());
  result->PredictedClass (src->PredictedClass ());
  //result->Version      (src->Version        ());
  result->TrainWeight    (src->TrainWeight    ());

  const float*  srcData = src->FeatureData ();

  for  (kkint32 featIdx = 0;  featIdx < numOfFeatures;  ++featIdx)
  {
    const float    rawValue  = srcData[srcFeatureNums[featIdx]];
    const kkint32  destStart = destFeatureNums[featIdx];
    const auto     action    = destWhatToDo[featIdx];

    if  (action == FeWhatToDo::FeAsIs)
    {
      result->AddFeatureData (destStart, rawValue);
    }
    else if  (action == FeWhatToDo::FeBinary)
    {
      // One destination slot per possible category; only the matching one is set to 1.0.
      for  (kkint32 code = 0;  code < cardinalityDest[featIdx];  ++code)
      {
        const float  indicator = (static_cast<kkint32> (rawValue) == code) ? 1.0f : 0.0f;
        result->AddFeatureData (destStart + code, indicator);
      }
    }
    else if  (action == FeWhatToDo::FeScale)
    {
      result->AddFeatureData (destStart, rawValue / static_cast<float> (cardinalityDest[featIdx]));
    }
  }

  return  result;
}  /* EncodeAExample */
/**
 * @brief Counts how many sparse nodes are required to encode every example in 'src'.
 *
 * Applies the same per-feature rules as EncodeAExample (FeatureVectorPtr, svm_node*, kkint32&):
 *   - FeAsIs / FeScale : one node when the source value is non-zero.
 *   - FeBinary         : one node when the source value, truncated to an integer, falls in
 *                        [0, cardinalityDest); at most one indicator can ever be non-zero.
 * One additional node is counted per example for the terminating node.
 *
 * @param[in] src  List of examples to measure; must not be NULL.
 * @return Total number of nodes needed for all examples, terminators included.
 */
kkint32  FeatureEncoder::DetermineNumberOfNeededXspaceNodes (FeatureVectorListPtr  src)  const
{
  kkint32  xSpaceNodesNeeded = 0;

  FeatureVectorList::const_iterator  idx;
  for  (idx = src->begin ();  idx != src->end ();  ++idx)
  {
    FeatureVectorPtr  fv = *idx;
    const float*  featureData = fv->FeatureData ();

    for  (kkint32 x = 0;  x < numOfFeatures;  ++x)
    {
      const float  featureVal = featureData[srcFeatureNums[x]];

      switch  (destWhatToDo[x])
      {
      case  FeWhatToDo::FeAsIs:
      case  FeWhatToDo::FeScale:
        // Zero values are not stored in the sparse representation.
        if  (featureVal != 0.0f)
          ++xSpaceNodesNeeded;
        break;

      case  FeWhatToDo::FeBinary:
        {
          // Exactly one indicator is non-zero when the category is in range, none otherwise;
          // there is no need to walk every possible category to find that out.
          const kkint32  category = static_cast<kkint32> (featureVal);
          if  ((category >= 0)  &&  (category < cardinalityDest[x]))
            ++xSpaceNodesNeeded;
        }
        break;
      }
    }

    // Terminating node that closes each encoded example.
    ++xSpaceNodesNeeded;
  }

  return  xSpaceNodesNeeded;
}  /* DetermineNumberOfNeededXspaceNodes */
void FeatureFileConverter::ConvertData () { cout << endl << "Saving [" << data->QueueSize () << "] records to data file[" << destFileName << "]" << endl << endl; bool successful = false; int numOfFeatures = data->NumOfFeatures (); int numWithAllZeros = 0; { FeatureVectorListPtr newData = new FeatureVectorList (srcFileDesc, true, log); // Will store examples that have all zero's for all features in "zeroData" // container. This way they can be deleted from memory later and not result // in a memory leak. This has to be done because they are not going to // be placed into newData which is going to become the owner of all the // examples. FeatureVectorListPtr zeroData = new FeatureVectorList (srcFileDesc, true, log); // How many have all 0's for feature data. FeatureVectorList::iterator idx; for (idx = data->begin (); idx != data->end (); idx++) { FeatureVectorPtr i = *idx; bool allZeros = true; for (int featureNum = 0; featureNum < numOfFeatures; featureNum++) { allZeros = (i->FeatureData (featureNum) == 0.0f); if (!allZeros) break; } if (allZeros) { numWithAllZeros++; zeroData->PushOnBack (i); } else { newData->PushOnBack (i); } } data->Owner (false); delete data; data = newData; delete zeroData; } *report << endl << endl << "Num of data items with all zero feature data [" << numWithAllZeros << "]" << endl << endl; *report << data->ClassStatisticsStr (); *report << endl << endl << endl; if (statistics) { *report << "Class Statistics:" << endl; data->PrintClassStatistics (*report); *report << endl << endl; *report << "Feature Statistics:" << endl; data->PrintFeatureStatisticsByClass (*report); } if (enumerateClasses) { // We are going to change the name of the classes to numbers enumberated by className MLClassConstListPtr mlClasses = data->ExtractMLClassConstList (); mlClasses->SortByName (); MLClassConstListPtr newClassNames = new MLClassConstList (); int classIdx = 0; MLClassConstList::iterator idx; for (idx = mlClasses->begin (); idx != mlClasses->end (); idx++) 
{ KKStr newName = StrFormatInt (classIdx, "zzz0"); MLClassConstPtr mlClass = newClassNames->GetMLClassPtr (newName); classIdx++; } FeatureVectorList::iterator idx2; for (idx2 = data->begin (); idx2 != data->end (); idx2++) { MLClassConstPtr c = (*idx2)->MLClass (); int classIndex = mlClasses->PtrToIdx (c); (*idx2)->MLClass (newClassNames->IdxToPtr (classIndex)); } delete mlClasses; mlClasses = NULL; delete newClassNames; newClassNames = NULL; } if (encodeFeatureData) { EncodeFeatureData (); } else { uint numExamplesWritten = 0; destFileFormat->SaveFeatureFile (destFileName, *features, *data, numExamplesWritten, cancelFlag, successful, log ); } } /* ConvertData */
/** * @brief Converts a single example into the svm_problem format. * @param[in] The example That we're converting * @param[in] The row kkint32 he svm_problem structure that the converted data will be stored */ void FeatureEncoder::EncodeAExample (FeatureVectorPtr example, svm_node* xSpace, kkint32& xSpaceUsed ) { const float* featureData = example->FeatureData (); kkint32 x; xSpaceUsed = 0; for (x = 0; x < numOfFeatures; x++) { float featureVal = featureData [srcFeatureNums[x]]; kkint32 y = destFeatureNums[x]; if (y >= xSpaceNeededPerExample) { KKStr errMsg (128); errMsg << "FeatureEncoder::EncodeAExample ***ERROR*** xSpaceNeededPerExample[" << xSpaceNeededPerExample << "]."; cerr << endl << "FeatureEncoder::EncodeAExample *** ERROR ***" << endl << " " << errMsg << endl << endl; throw KKException (errMsg); } switch (destWhatToDo[x]) { case FeWhatToDo::FeAsIs: { if (featureVal != 0.0) { xSpace[xSpaceUsed].index = y; xSpace[xSpaceUsed].value = featureVal; xSpaceUsed++; } } break; case FeWhatToDo::FeBinary: { for (kkint32 z = 0; z < cardinalityDest[x]; z++) { float bVal = ((kkint32)featureVal == z); if (bVal != 0.0) { xSpace[xSpaceUsed].index = y; xSpace[xSpaceUsed].value = bVal; xSpaceUsed++; } y++; } } break; case FeWhatToDo::FeScale: { if (featureVal != (float)0.0) { xSpace[xSpaceUsed].index = y; xSpace[xSpaceUsed].value = featureVal / (float)cardinalityDest[x]; xSpaceUsed++; } } break; } } xSpace[xSpaceUsed].index = -1; xSpace[xSpaceUsed].value = -1; xSpaceUsed++; } /* EncodeAExample */