int main() { char * SampleSet = "iris.data.txt"; int k = 3; CKNN cknn(k); cknn.Load(SampleSet); cknn.get_accuracy(); system("pause"); return 0 ; }
/*! \section example51 Example 51: (DOS) Result regularization using secondary criterion. It is known that feature selection may over-fit. As in the case of over-trained classifiers, over-selected feature subsets may generalize poorly. This unwanted effect can lead to serious degradation of generalization ability, i.e., model or decision-rule behavior on previously unknown data. It has been suggested (Raudys: Feature Over-Selection, LNCS 4109, 2006, or Somol et al., ICPR 2010) that preferring a subset with slightly-worse-than-maximal criterion value can actually improve generalization. FST3 makes this possible through result tracking and subsequent selection of alternative solution by means of secondary criterion maximization. In this example we show a 3-Nearest Neighbor Wrapper based feature selection process, where the final result is eventually chosen among a group of solutions close enough to the achieved maximum, so as to optimize the secondary criterion. The group of solutions to select from is defined by means of a user-selected margin value (permitted primary criterion value difference from the known maximum). In this case we show that even the simplest secondary criterion (mere preference of smaller subsets) can improve classifcation accuracy on previously unknown data. */ int main() { try{ typedef double RETURNTYPE; typedef double DATATYPE; typedef double REALTYPE; typedef unsigned int IDXTYPE; typedef unsigned int DIMTYPE; typedef short BINTYPE; typedef FST::Subset<BINTYPE, DIMTYPE> SUBSET; typedef FST::Data_Intervaller<std::vector<FST::Data_Interval<IDXTYPE> >,IDXTYPE> INTERVALLER; typedef boost::shared_ptr<FST::Data_Splitter<INTERVALLER,IDXTYPE> > PSPLITTER; typedef FST::Data_Splitter_CV<INTERVALLER,IDXTYPE> SPLITTERCV; typedef FST::Data_Splitter_5050<INTERVALLER,IDXTYPE> SPLITTER5050; typedef FST::Data_Accessor_Splitting_MemTRN<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for TRN data format //typedef FST::Data_Accessor_Splitting_MemARFF<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for ARFF data format typedef FST::Distance_Euclid<DATATYPE,DIMTYPE,SUBSET> DISTANCE; typedef FST::Classifier_kNN<RETURNTYPE,DATATYPE,IDXTYPE,DIMTYPE,SUBSET,DATAACCESSOR,DISTANCE> CLASSIFIERKNN; typedef FST::Criterion_Wrapper<RETURNTYPE,SUBSET,CLASSIFIERKNN,DATAACCESSOR> WRAPPERKNN; typedef FST::Criterion_Subset_Size<RETURNTYPE,SUBSET> CRITSUBSIZE; typedef FST::Criterion_Negative<CRITSUBSIZE,RETURNTYPE,SUBSET> NEGATIVECRIT; typedef FST::Sequential_Step_Straight<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN> EVALUATOR; typedef FST::Result_Tracker_Regularizer<RETURNTYPE,IDXTYPE,DIMTYPE,SUBSET,NEGATIVECRIT> TRACKER; std::cout << "Starting Example 51: (DOS) Result regularization using secondary criterion..." << std::endl; // keep second half of data for independent testing of final classification performance PSPLITTER dsp_outer(new SPLITTER5050()); // in the course of search use the first half of data by 3-fold cross-validation in wrapper FS criterion evaluation PSPLITTER dsp_inner(new SPLITTERCV(3)); // do not scale data boost::shared_ptr<FST::Data_Scaler<DATATYPE> > dsc(new FST::Data_Scaler_void<DATATYPE>()); // set-up data access boost::shared_ptr<std::vector<PSPLITTER> > splitters(new std::vector<PSPLITTER>); splitters->push_back(dsp_outer); splitters->push_back(dsp_inner); boost::shared_ptr<DATAACCESSOR> da(new DATAACCESSOR("data/waveform_40.trn",splitters,dsc)); da->initialize(); // initiate access to split data parts da->setSplittingDepth(0); if(!da->getFirstSplit()) throw FST::fst_error("50/50 data split failed."); da->setSplittingDepth(1); if(!da->getFirstSplit()) throw FST::fst_error("3-fold cross-validation failure."); // initiate the storage for subset to-be-selected boost::shared_ptr<SUBSET> sub(new SUBSET(da->getNoOfFeatures())); sub->deselect_all(); // set-up 3-Nearest Neighbor classifier based on Euclidean distances boost::shared_ptr<CLASSIFIERKNN> cknn(new CLASSIFIERKNN); cknn->set_k(3); // wrap the 3-NN classifier to enable its usage as FS criterion (criterion value will be estimated by 3-fold cross-val.) boost::shared_ptr<WRAPPERKNN> wknn(new WRAPPERKNN); wknn->initialize(cknn,da); // set-up the standard sequential search step object (option: hybrid, ensemble) boost::shared_ptr<EVALUATOR> eval(new EVALUATOR); // set-up Dynamic Oscillating Search procedure FST::Search_DOS<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN,EVALUATOR> srch(eval); srch.set_delta(3); // set-up the regularizing result tracker boost::shared_ptr<TRACKER> tracker(new TRACKER); // register the result tracker with the used search step object eval->enable_result_tracking(tracker); // run the search std::cout << "Feature selection setup:" << std::endl << *da << std::endl << srch << std::endl << *wknn << std::endl << *tracker << std::endl << std::endl; RETURNTYPE critval_train, critval_test; srch.set_output_detail(FST::NORMAL); // set FST::SILENT to disable all text output in the course of search (FST::NORMAL is default) if(!srch.search(0,critval_train,sub,wknn,std::cout)) throw FST::fst_error("Search not finished."); // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << std::endl << std::endl; // set-up the secondary criterion (regularization criterion); in this case to minimize subset size boost::shared_ptr<CRITSUBSIZE> critsubsiz(new CRITSUBSIZE); //Criterion_Subset_Size does not need to be initialized boost::shared_ptr<NEGATIVECRIT> regulcrit(new NEGATIVECRIT(critsubsiz)); //Criterion_Negative does not need to be initialized // select final solution among those recorded by tracker (show more alternatives for various margins) tracker->set_output_detail(FST::NORMAL); // set FST::SILENT to disable all text output in the course of search (FST::NORMAL is default) for(unsigned int i=1; i<10; i++) { RETURNTYPE margin=(double)i*0.001; da->setSplittingDepth(1); // necessary with criteria than need access to training data if(!tracker->optimize_within_margin(margin,critval_train,critval_test,sub,regulcrit)) throw FST::fst_error("tracker->optimize_within_margin() failed."); std::cout << std::endl << "Regularized (margin="<<margin<<") result: " << std::endl << *sub << "Criterion value=" << critval_train << std::endl; // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << std::endl << std::endl; } } catch(FST::fst_error &e) {std::cerr<<"FST ERROR: "<< e.what() << ", code=" << e.code() << std::endl;} catch(std::exception &e) {std::cerr<<"non-FST ERROR: "<< e.what() << std::endl;} return 0; }
/*! \section example40 Example 40: Exhaustive (optimal) feature selection. Selects features exhaustively, i.e., evaluates all possible feature combinations. This approach is guaranteed to find optimum with respect to the chosen criterion, but its exponential time complexity renders it prohibitive for even moderately dimensioned tasks. Here it is demonstrated on 15-dimensional data with 3-NN (based on L1.5 distance) wrapper classification accuracy as FS criterion - note how time consuming the computation is even for relatively low-dimensional case. Classification accuracy (i.e, FS wrapper criterion value) is estimated on the first 50% of data samples by means of 3-fold cross-validation. The final classification performance on the selected subspace is eventually validated on the second 50% of data. Exhaustive search is called here in d-optimizing setting, invoked by parameter 0 in search(0,...), which is otherwise used to specify the required subset size. Optional result tracking is employed here to reveal duplicate solutions yielding the same maximum criterion value (see also \ref example60). */ int main() { try{ typedef double RETURNTYPE; typedef double DATATYPE; typedef double REALTYPE; typedef unsigned int IDXTYPE; typedef unsigned int DIMTYPE; typedef short BINTYPE; typedef FST::Subset<BINTYPE, DIMTYPE> SUBSET; typedef FST::Data_Intervaller<std::vector<FST::Data_Interval<IDXTYPE> >,IDXTYPE> INTERVALLER; typedef boost::shared_ptr<FST::Data_Splitter<INTERVALLER,IDXTYPE> > PSPLITTER; typedef FST::Data_Splitter_CV<INTERVALLER,IDXTYPE> SPLITTERCV; typedef FST::Data_Splitter_5050<INTERVALLER,IDXTYPE> SPLITTER5050; typedef FST::Data_Accessor_Splitting_MemTRN<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for TRN data format //typedef FST::Data_Accessor_Splitting_MemARFF<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for ARFF data format typedef FST::Distance_Lp<DATATYPE,REALTYPE,DIMTYPE,SUBSET,3,2> DISTANCE; typedef FST::Classifier_kNN<RETURNTYPE,DATATYPE,IDXTYPE,DIMTYPE,SUBSET,DATAACCESSOR,DISTANCE> CLASSIFIERKNN; typedef FST::Criterion_Wrapper<RETURNTYPE,SUBSET,CLASSIFIERKNN,DATAACCESSOR> WRAPPERKNN; typedef FST::Result_Tracker_Dupless<RETURNTYPE,IDXTYPE,DIMTYPE,SUBSET> TRACKER; std::cout << "Starting Example 40: Exhaustive (optimal) feature selection..." << std::endl; // keep second half of data for independent testing of final classification performance PSPLITTER dsp_outer(new SPLITTER5050()); // in the course of search use the first half of data by 3-fold cross-validation in wrapper FS criterion evaluation PSPLITTER dsp_inner(new SPLITTERCV(3)); // do not scale data boost::shared_ptr<FST::Data_Scaler<DATATYPE> > dsc(new FST::Data_Scaler_void<DATATYPE>()); // set-up data access boost::shared_ptr<std::vector<PSPLITTER> > splitters(new std::vector<PSPLITTER>); splitters->push_back(dsp_outer); splitters->push_back(dsp_inner); boost::shared_ptr<DATAACCESSOR> da(new DATAACCESSOR("data/speech_15.trn",splitters,dsc)); da->initialize(); // initiate access to split data parts da->setSplittingDepth(0); if(!da->getFirstSplit()) throw FST::fst_error("50/50 data split failed."); da->setSplittingDepth(1); if(!da->getFirstSplit()) throw FST::fst_error("3-fold cross-validation failure."); // initiate the storage for subset to-be-selected boost::shared_ptr<SUBSET> sub(new SUBSET(da->getNoOfFeatures())); sub->deselect_all(); // set-up 3-Nearest Neighbor classifier based on Euclidean distances boost::shared_ptr<CLASSIFIERKNN> cknn(new CLASSIFIERKNN); cknn->set_k(3); // wrap the 3-NN classifier to enable its usage as FS criterion (criterion value will be estimated by 3-fold cross-val.) boost::shared_ptr<WRAPPERKNN> wknn(new WRAPPERKNN); wknn->initialize(cknn,da); // set-up Exhaustive Search procedure FST::Search_Exhaustive<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN> srch; // set-up result tracker to enable logging of candidate solutions, ordered descending by value // (optionally limit the number of kept records to 50000 highest valued to prevent memory exhaustion due to possibly excessive number of candidates) boost::shared_ptr<TRACKER> tracker(new TRACKER(50000)); // let the tracker register only solution no worse than "the best known criterion value minus 0.05" tracker->set_inclusion_margin(0.05); // register the result tracker with the used search object srch.enable_result_tracking(tracker); // run the search std::cout << "Feature selection setup:" << std::endl << *da << std::endl << srch << std::endl << *wknn << std::endl << std::endl; RETURNTYPE critval_train, critval_test; srch.set_output_detail(FST::NORMAL); // set FST::SILENT to disable all text output in the course of search (FST::NORMAL is default) if(!srch.search(0,critval_train,sub,wknn,std::cout)) throw FST::fst_error("Search not finished."); // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << std::endl << std::endl; // report tracker contents std::cout << "Result tracker records " << tracker->size(0.0) << " solutions with criterion value equal to " << critval_train << "." << std::endl << std::endl; for(unsigned int i=1;i<10;i++) std::cout << "Result tracker records " << tracker->size((double)i*0.005) << " solutions with criterion value greater or equal to " << critval_train-(double)i*0.005 << "." << std::endl << std::endl; TRACKER::PResultRec result; if(tracker->get_first(result) && tracker->size(0.0)>1) { RETURNTYPE firstvalue=result->value; std::cout << "All recorded feature subsets yielding the same best known criterion value " << firstvalue << ":" << std::endl; while(tracker->get_next(result) && result->value==firstvalue) std::cout << *(result->sub) << std::endl; } } catch(FST::fst_error &e) {std::cerr<<"FST ERROR: "<< e.what() << ", code=" << e.code() << std::endl;} catch(std::exception &e) {std::cerr<<"non-FST ERROR: "<< e.what() << std::endl;} return 0; }
/*! \section example61 Example 61: Feature selection that respects pre-specified feature weights. In many applications it is desirable to optimize feature subsets not only with respect to the primary objective (e.g., decision rule accuracy), but also with respect to additional factors like known feature acquisition cost. In many cases there might be only negligible difference in discriminatory ability among several features, while the cost of measuring their value may differ a lot. In such a case it is certainly better to select the cheaper feature. In other scenarios it might be even advantageous to trade a minor degradation of classifcation accuracy for substantial saving in measurement acquisition cost. For such cases FST3 implements a mechanism that allows to control the feature accuracy vs. feature cost trade-off. It is made possible through result tracking and subsequent selection of alternative solution so as to minimize the sum of pre-specified feature weights. The lower-weight solution is selected from the pool of all known solutions that differ from the best one by less than a user-specifed margin (permitted primary criterion value difference from the known maximum value). In this example we illustrate how to add the respective mechanism to standard wrapper based feature selection. Here we select features so as to maximize 3-Nearest Neighbor accuracy; then several lower-weight solutions are identified and validated, for various margin values. */ int main() { try{ typedef double RETURNTYPE; typedef double DATATYPE; typedef double REALTYPE; typedef unsigned int IDXTYPE; typedef unsigned int DIMTYPE; typedef short BINTYPE; typedef FST::Subset<BINTYPE, DIMTYPE> SUBSET; typedef FST::Data_Intervaller<std::vector<FST::Data_Interval<IDXTYPE> >,IDXTYPE> INTERVALLER; typedef boost::shared_ptr<FST::Data_Splitter<INTERVALLER,IDXTYPE> > PSPLITTER; typedef FST::Data_Splitter_CV<INTERVALLER,IDXTYPE> SPLITTERCV; typedef FST::Data_Splitter_5050<INTERVALLER,IDXTYPE> SPLITTER5050; typedef FST::Data_Accessor_Splitting_MemTRN<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for TRN data format //typedef FST::Data_Accessor_Splitting_MemARFF<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for ARFF data format typedef FST::Distance_Euclid<DATATYPE,DIMTYPE,SUBSET> DISTANCE; typedef FST::Classifier_kNN<RETURNTYPE,DATATYPE,IDXTYPE,DIMTYPE,SUBSET,DATAACCESSOR,DISTANCE> CLASSIFIERKNN; typedef FST::Criterion_Wrapper<RETURNTYPE,SUBSET,CLASSIFIERKNN,DATAACCESSOR> WRAPPERKNN; typedef FST::Criterion_Sum_Of_Weights<RETURNTYPE,DIMTYPE,SUBSET> WEIGHCRIT; typedef FST::Criterion_Negative<WEIGHCRIT,RETURNTYPE,SUBSET> NEGATIVECRIT; typedef FST::Sequential_Step_Straight<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN> EVALUATOR; typedef FST::Result_Tracker_Regularizer<RETURNTYPE,IDXTYPE,DIMTYPE,SUBSET,NEGATIVECRIT> TRACKER; std::cout << "Starting Example 61: Feature selection that respects pre-specified feature weights..." << std::endl; // keep second half of data for independent testing of final classification performance PSPLITTER dsp_outer(new SPLITTER5050()); // in the course of search use the first half of data by 3-fold cross-validation in wrapper FS criterion evaluation PSPLITTER dsp_inner(new SPLITTERCV(3)); // do not scale data boost::shared_ptr<FST::Data_Scaler<DATATYPE> > dsc(new FST::Data_Scaler_void<DATATYPE>()); // set-up data access boost::shared_ptr<std::vector<PSPLITTER> > splitters(new std::vector<PSPLITTER>); splitters->push_back(dsp_outer); splitters->push_back(dsp_inner); boost::shared_ptr<DATAACCESSOR> da(new DATAACCESSOR("data/speech_15.trn",splitters,dsc)); da->initialize(); // initiate access to split data parts da->setSplittingDepth(0); if(!da->getFirstSplit()) throw FST::fst_error("50/50 data split failed."); da->setSplittingDepth(1); if(!da->getFirstSplit()) throw FST::fst_error("3-fold cross-validation failure."); // initiate the storage for subset to-be-selected boost::shared_ptr<SUBSET> sub(new SUBSET(da->getNoOfFeatures())); sub->deselect_all(); // set-up 3-Nearest Neighbor classifier based on Euclidean distances boost::shared_ptr<CLASSIFIERKNN> cknn(new CLASSIFIERKNN); cknn->set_k(3); // wrap the 3-NN classifier to enable its usage as FS criterion (criterion value will be estimated by 3-fold cross-val.) boost::shared_ptr<WRAPPERKNN> wknn(new WRAPPERKNN); wknn->initialize(cknn,da); // set-up the standard sequential search step object (option: hybrid, ensemble, threaded, etc.) boost::shared_ptr<EVALUATOR> eval(new EVALUATOR); // set-up Sequential Forward Floating Selection search procedure FST::Search_SFFS<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN,EVALUATOR> srch(eval); srch.set_search_direction(FST::FORWARD); // set-up tracker of intermediate results boost::shared_ptr<TRACKER> tracker(new TRACKER); // register the result tracker with the used search step object eval->enable_result_tracking(tracker); // run the search std::cout << "Feature selection setup:" << std::endl << *da << std::endl << srch << std::endl << *wknn << std::endl << *tracker << std::endl << std::endl; RETURNTYPE critval_train, critval_test; if(!srch.search(0,critval_train,sub,wknn,std::cout)) throw FST::fst_error("Search not finished."); // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); if(!wknn->evaluate(critval_train,sub)) throw FST::fst_error("crit call failure."); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << ", crit value="<< critval_train << std::endl << std::endl; // set-up the secondary criterion to minimize the sum of feature weights // (note that the concrete weight values shown here are sample only) RETURNTYPE feature_cost[]={1, 1.2, 1, 1.3, 1.02, 2.4, 3.9, 1.2, 7.1, 22, 9.52, 1.08, 3.27, 1.44, 1.04}; assert(sizeof(feature_cost)/sizeof(RETURNTYPE)==da->getNoOfFeatures()); boost::shared_ptr<WEIGHCRIT> weightsumcrit(new WEIGHCRIT); weightsumcrit->initialize(da->getNoOfFeatures(),feature_cost); boost::shared_ptr<NEGATIVECRIT> critminw(new NEGATIVECRIT(weightsumcrit)); // select final solution among those recorded by tracker (show more alternatives for various margins) for(unsigned int i=0; i<10; i++) { const RETURNTYPE margin=(double)i*0.005; if(!tracker->optimize_within_margin(margin,critval_train,critval_test,sub,critminw)) throw FST::fst_error("tracker2->optimize_within_margin() failed."); std::cout << std::endl << "Weight-optimized result (primary criterion margin="<<margin<<"): " << std::endl << *sub << "Criterion value=" << critval_train << std::endl << "Sum of weights=" << -critval_test << std::endl; // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << std::endl << std::endl; } } catch(FST::fst_error &e) {std::cerr<<"FST ERROR: "<< e.what() << ", code=" << e.code() << std::endl;} catch(std::exception &e) {std::cerr<<"non-FST ERROR: "<< e.what() << std::endl;} return 0; }
/*! \section example21 Example 21: Generalized sequential feature subset search. All sequential search algorithms (SFS, SFFS, OS, DOS, SFRS) can be extended to operate in "generalized" setting (term coined in Devijver, Kittler book). In each step of a generalized sequential search algorithm not only one best feature is added to current subset nor one worst feature is removed from current subset; instead, g-tuples of features are considered. Searching for such group of g features that improves the current subset the most when added (or such that degrades the current subset the least when removed) is more computationally complex but increases the chance of finding the optimum or a result closer to optimum (nevertheless, improvement is not guaranteed and in some cases the result can actually degrade). The value g is to be set by user; the higher the value g, the slower the search (time complexity increases exponentially with increasing g). Note that setting g equal to the number of all features would effectively emulate the operation of exhaustive search. In this example features are selected using the generalized (G)SFFS algorithm (G=2) and 3-NN wrapper classification accuracy as FS criterion. Classification accuracy (i.e, FS wrapper criterion value) is estimated on the first 50% of data samples by means of 3-fold cross-validation. The final classification performance on the selected subspace is eventually validated on the second 50% of data. (G)SFFS is called here in d-optimizing setting, invoked by parameter 0 in search(0,...), which is otherwise used to specify the required subset size. \note Note that in this context the term generalization does not! relate to classification performance on independent data. */ int main() { try{ typedef double RETURNTYPE; typedef double DATATYPE; typedef double REALTYPE; typedef unsigned int IDXTYPE; typedef unsigned int DIMTYPE; typedef short BINTYPE; typedef FST::Subset<BINTYPE, DIMTYPE> SUBSET; typedef FST::Data_Intervaller<std::vector<FST::Data_Interval<IDXTYPE> >,IDXTYPE> INTERVALLER; typedef boost::shared_ptr<FST::Data_Splitter<INTERVALLER,IDXTYPE> > PSPLITTER; typedef FST::Data_Splitter_CV<INTERVALLER,IDXTYPE> SPLITTERCV; typedef FST::Data_Splitter_5050<INTERVALLER,IDXTYPE> SPLITTER5050; typedef FST::Data_Accessor_Splitting_MemTRN<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for TRN data format //typedef FST::Data_Accessor_Splitting_MemARFF<DATATYPE,IDXTYPE,INTERVALLER> DATAACCESSOR; // uncomment for ARFF data format typedef FST::Distance_Euclid<DATATYPE,DIMTYPE,SUBSET> DISTANCE; typedef FST::Classifier_kNN<RETURNTYPE,DATATYPE,IDXTYPE,DIMTYPE,SUBSET,DATAACCESSOR,DISTANCE> CLASSIFIERKNN; typedef FST::Criterion_Wrapper<RETURNTYPE,SUBSET,CLASSIFIERKNN,DATAACCESSOR> WRAPPERKNN; typedef FST::Sequential_Step_Straight<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN> EVALUATOR; std::cout << "Starting Example 21: Generalized sequential feature subset search..." << std::endl; // keep second half of data for independent testing of final classification performance PSPLITTER dsp_outer(new SPLITTER5050()); // in the course of search use the first half of data by 3-fold cross-validation in wrapper FS criterion evaluation PSPLITTER dsp_inner(new SPLITTERCV(3)); // do not scale data boost::shared_ptr<FST::Data_Scaler<DATATYPE> > dsc(new FST::Data_Scaler_void<DATATYPE>()); // set-up data access boost::shared_ptr<std::vector<PSPLITTER> > splitters(new std::vector<PSPLITTER>); splitters->push_back(dsp_outer); splitters->push_back(dsp_inner); boost::shared_ptr<DATAACCESSOR> da(new DATAACCESSOR("data/speech_15.trn",splitters,dsc)); da->initialize(); // initiate access to split data parts da->setSplittingDepth(0); if(!da->getFirstSplit()) throw FST::fst_error("50/50 data split failed."); da->setSplittingDepth(1); if(!da->getFirstSplit()) throw FST::fst_error("3-fold cross-validation failure."); // initiate the storage for subset to-be-selected boost::shared_ptr<SUBSET> sub(new SUBSET(da->getNoOfFeatures())); sub->deselect_all(); // set-up 3-Nearest Neighbor classifier based on Euclidean distances boost::shared_ptr<CLASSIFIERKNN> cknn(new CLASSIFIERKNN); cknn->set_k(5); // wrap the 3-NN classifier to enable its usage as FS criterion (criterion value will be estimated by 3-fold cross-val.) boost::shared_ptr<WRAPPERKNN> wknn(new WRAPPERKNN); wknn->initialize(cknn,da); // set-up the standard sequential search step object (option: hybrid, ensemble, etc.) boost::shared_ptr<EVALUATOR> eval(new EVALUATOR); // set-up Sequential Forward Floating Selection search procedure FST::Search_SFFS<RETURNTYPE,DIMTYPE,SUBSET,WRAPPERKNN,EVALUATOR> srch(eval); srch.set_search_direction(FST::FORWARD); // try FST::BACKWARD // set the size of feature groups to be evaluated for inclusion/removal in each sequential step (can be applied to SFS, SFFS, OS, DOS, SFRS) srch.set_generalization_level(2); // run the search std::cout << "Feature selection setup:" << std::endl << *da << std::endl << srch << std::endl << *wknn << std::endl << std::endl; RETURNTYPE critval_train, critval_test; srch.set_output_detail(FST::NORMAL); // set FST::SILENT to disable all text output in the course of search (FST::NORMAL is default) if(!srch.search(0,critval_train,sub,wknn,std::cout)) throw FST::fst_error("Search not finished."); // (optionally) validate result by estimating kNN accuracy on selected feature sub-space on independent test data da->setSplittingDepth(0); cknn->train(da,sub); cknn->test(critval_test,da); std::cout << "Validated "<<cknn->get_k()<<"-NN accuracy=" << critval_test << std::endl << std::endl; // (optionally) list the best known solutions for each cardinality as recorded throughout the course of search std::cout << "Best recorded solution for subset size:" << std::endl; for(DIMTYPE d=1;d<=sub->get_n();d++) if(srch.get_result(d,critval_train,sub)) std::cout << d << ": val="<< critval_train << ", "<<*sub << std::endl; } catch(FST::fst_error &e) {std::cerr<<"FST ERROR: "<< e.what() << ", code=" << e.code() << std::endl;} catch(std::exception &e) {std::cerr<<"non-FST ERROR: "<< e.what() << std::endl;} return 0; }