//select the base classifier with the highest accuracy on validation set CForwardSelect::CForwardSelect(const CEnsemble &UEnsemble,const CDataset &ValidatingSet) :CEnsemblePruner(UEnsemble) { Name=MyName; //Info int CaseNum=ValidatingSet.GetInfo().Height; int EnsembleSize=Ensemble.GetSize(); //start time for training clock_t start=clock(); //get prediction vector<CPrediction*> *Predictions=Ensemble.AllClassify(ValidatingSet); //initialize with no classifier selected for(int i=0;i<EnsembleSize;i++) Weights.push_back(0); //add classifier one by one double BestAccr=0; for(int i=0;i<EnsembleSize;i++) { //add the best in each round int Best=-1; for(int j=0;j<EnsembleSize;j++) { //skip the one has been selected if(Weights[j]>0)continue; //add this classifier temporarily Weights[j]=1; //predicting CPrediction *Prediction=Ensemble.Classify(ValidatingSet,*Predictions,Weights); double Accuracy=Prediction->GetAccuracy(); delete Prediction; //better accuracy? if(Accuracy>BestAccr) { Best=j; BestAccr=Accuracy; //if accuracy is 1.0, no better one can be found if(Accuracy>=1.0) break; } //recover to the initial state Weights[j]=0; } //if accuracy is 1.0, no better one can be found if(BestAccr>=1.0) break; //select the best one of this round if(Best!=-1) Weights[Best]=1; } for(int i=0;i<EnsembleSize;i++) delete ((*Predictions)[i]); delete Predictions; //time consumed CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC; }
// Prunes an ensemble with the PMEP procedure.
//
// The instance/classifier correctness table is built from Predictions and
// turned into an FP-tree. For every odd candidate size k, the tree paths of
// length k/2+1 are combined greedily into a classifier set with at most k
// members. All candidate sets are then sorted and the members of the first one
// get weight 1.0 in Weights; every other classifier keeps weight 0.
//
// UEnsemble:     ensemble to prune (passed on to CEnsemblePruner)
// ValidatingSet: validation data, handed to BuildCaseClassTab with Predictions
// Predictions:   one prediction per base classifier of the ensemble
// Throws CError when Predictions does not have one entry per classifier or
// when the correctness table cannot be built.
CPMEP::CPMEP(const CEnsemble &UEnsemble,const CDataset &ValidatingSet,const vector<CPrediction*> &Predictions)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	if(Ensemble.GetSize()!=(int)Predictions.size())
		throw(CError("CPMEP: wrong size for user-defined weights!",100,0));
	int EnsembleSize=Ensemble.GetSize();
	//start time for training
	clock_t start=clock();

	//table: instance-classifier-prediction
	vector<CaseClassArrayStr> CaseClassTab;
	//a non-zero return means the table is missing or incomplete; do not prune on it
	//NOTE(review): error code 101 was picked to differ from the 100 used above - confirm against the project's codes
	if(BuildCaseClassTab(CaseClassTab,ValidatingSet,Predictions)!=0)
		throw(CError("CPMEP: failed to build the instance-classifier table!",101,0));

	//construct FP-tree
	TreeNodeStr Root;
	BuildFPTree(Root,CaseClassTab,EnsembleSize);

	vector<SelClassifierStr> SelClassifiers;
	//k: number of classifiers to be selected (odd values only)
	for(int k=1;k<=EnsembleSize/2*2+1;k+=2)
	{
		//path-table: paths with length of k/2+1
		vector<TreePathStr> TreePathTable;
		TreePathStr TreePath;
		BuildPathTab(TreePathTable,Root,TreePath,k/2+1);

		//selected classifiers (no more than k) and a scratch copy used to try the next path
		SelClassifierStr S,TempS;
		S.Count=0;
		//add paths until the path-table is empty or k classifiers are selected
		while(!TreePathTable.empty() && (int)S.Set.size()<k)
		{
			//order the paths: CountDescOrder is the primary key, ClassNumOrder breaks ties
			//(the stable second sort keeps the order of the first one among equal elements)
			sort(TreePathTable.begin(),TreePathTable.end(),ClassNumOrder);
			stable_sort(TreePathTable.begin(),TreePathTable.end(),CountDescOrder);

			//temporarily select all classifiers of the first path
			TempS=S;
			TempS.Count+=TreePathTable[0].Count;
			for(int j=0;j<(int)TreePathTable[0].Classifiers.size();j++)
				TempS.Set.insert(TreePathTable[0].Classifiers[j]);

			//adding this path would make the selection larger than k, so skip it
			if((int)TempS.Set.size()>k)
			{
				TreePathTable.erase(TreePathTable.begin());
				continue;
			}
			S=TempS;

			//remove classifiers of the selected path from all other rows of the path-table
			for(int jj=0;jj<(int)TreePathTable[0].Classifiers.size();jj++)
			{
				for(int i=1;i<(int)TreePathTable.size();i++)
					for(int j=0;j<(int)TreePathTable[i].Classifiers.size();j++)
						if(TreePathTable[i].Classifiers[j]==TreePathTable[0].Classifiers[jj])
						{
							TreePathTable[i].Classifiers.erase(TreePathTable[i].Classifiers.begin()+j);
							break;
						}
			}

			//remove empty rows from the path-table
			for(int i=1;i<(int)TreePathTable.size();)
			{
				if(TreePathTable[i].Classifiers.size()<=0)
				{
					//every classifier of this path is already selected, so its Count is
					//credited to the selection itself (adding it to TempS would be lost,
					//because TempS is overwritten at the start of the next iteration)
					S.Count+=TreePathTable[i].Count;
					TreePathTable.erase(TreePathTable.begin()+i);
					continue;
				}
				i++;
			}

			//this path is finished
			TreePathTable.erase(TreePathTable.begin());

			//merge paths that now consist of the same classifiers (order-insensitive)
			for(int i=0;i<(int)TreePathTable.size();i++)
			{
				set<int> A0(TreePathTable[i].Classifiers.begin(),TreePathTable[i].Classifiers.end());
				for(int j=i+1;j<(int)TreePathTable.size();)
				{
					set<int> A1(TreePathTable[j].Classifiers.begin(),TreePathTable[j].Classifiers.end());
					if(A0==A1)
					{
						TreePathTable[i].Count+=TreePathTable[j].Count;
						TreePathTable.erase(TreePathTable.begin()+j);
						continue;
					}
					j++;
				}
			}//for i
		}//while

		SelClassifiers.push_back(S);
	}//for k
	//NOTE(review): the original call "deltree(&&Root)" is commented out - check whether the FP-tree nodes are released elsewhere

	//order the candidate sets: SelClassCountDescOrder is the primary key, SelClassSetSizeOrder breaks ties
	sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassSetSizeOrder);
	stable_sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassCountDescOrder);

	//set the weight of selected classifiers
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	//the k-loop above always runs at least once, so SelClassifiers[0] exists
	set<int>::iterator i_Classifier;
	for(i_Classifier=SelClassifiers[0].Set.begin();i_Classifier!=SelClassifiers[0].Set.end();++i_Classifier)
	{
		//ids outside [0,EnsembleSize) are ignored
		if(*i_Classifier>=0 && *i_Classifier<EnsembleSize)
			Weights[*i_Classifier]=1.0;
	}

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
//CaseClassTab: int CPMEP::BuildCaseClassTab(vector<CaseClassArrayStr> &CaseClassTab,const CDataset &ValidatingSet, const vector<CPrediction*> &Predictions) { //Info int CaseNum=ValidatingSet.GetInfo().Height; int EnsembleSize=Ensemble.GetSize(); if(Predictions[0]->GetCaseNum()!=CaseNum) { printf("DataSet->height!=BpnnResult->CaseNum"); return 1; } //construct and initialize the table CaseClassTab.clear(); { //each instance a row //row vector<CaseClassRecStr> CaseClassArray; //item: each classifier a column CaseClassRecStr CaseClassRec; CaseClassRec.Correct=0; // CaseClassRec.NodeLink=NULL; //total column: classifier number +1 //last column: number of classifiers that predict this instance correctly for(int k=0;k<=EnsembleSize;k++) { CaseClassRec.Classifier=k; CaseClassArray.push_back(CaseClassRec); } //total row=CaseNum+1 //Last row: number of instances predicted correctly by this classifier and id of it for(int j=0;j<=CaseNum;j++) CaseClassTab.push_back(CaseClassArray); } //fill it for(int i=0;i<EnsembleSize;i++) { // for(int j=0;j<CaseNum;j++) { //is this prediction correct? if(Predictions[i]->GetCorrectness()[j]) { if(CaseClassTab[j][i].Correct!=0) { printf("CaseClassTab[j][i].Correct!=0"); return 2; } CaseClassTab[j][i].Correct++; //last column: number of classifiers that predict this instance correctly CaseClassTab[j][EnsembleSize].Correct++; //last row: number of instances correctly predicted by this classifier CaseClassTab[CaseNum][i].Correct++; } } } //sort the columns of the last row by descent order of corresponding classifiers' prediction accuracy sort(CaseClassTab[CaseNum].begin(),CaseClassTab[CaseNum].end(),CorrectDescOrder); // Dump("a.txt",CaseClassTab); //sort columns of other rows as the order of the last row for(int i=0;i<EnsembleSize;i++) { //are the left classifiers incorrectly predict all instances? 
if(CaseClassTab[CaseNum][i].Correct==0)break; //find each column's new position int k; for(k=i;k<EnsembleSize;k++) if(CaseClassTab[0][k].Classifier==CaseClassTab[CaseNum][i].Classifier) break; //don't need to change position? if(k==i) continue; //switch to new position(k -> i) CaseClassRecStr TempCaseClassRec; for(int j=0;j<CaseNum;j++) { TempCaseClassRec=CaseClassTab[j][i]; CaseClassTab[j][i]=CaseClassTab[j][k]; CaseClassTab[j][k]=TempCaseClassRec; } } // Dump("a.txt",CaseClassTab); return 0; }
// Builds the estimators of a naive Bayes classifier from TrainSet.
//
// For every attribute (all valid columns except the last, which is read as the
// class label) and every class label one estimator is kept:
//   - discrete attribute: a frequency per attribute value
//   - any other attribute: SplitNum intervals over the attribute's [Min,Max]
// Estimators start from pseudo counts, the training instances are counted in,
// and finally the counts are divided by the estimator's total Count.
//
// TrainSet:  training data
// USplitNum: number of intervals used for non-discrete attributes
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
	Name=MyName;
	SplitNum=USplitNum;
	//training time is measured from here
	clock_t start=clock();

	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();
	//index of the label column; the columns before it are the attributes
	const int LabelCol=OrgInfo.ValidWidth-1;

	//step 1: one initial estimator per attribute, copied for every class label
	for(int Attr=0;Attr<LabelCol;Attr++)
	{
		EstimatorStr Prior;
		Prior.AttType=OrgInfo.ValidAttrs[Attr].AttType;
		if(Prior.AttType!=ATT_DISCRETE)
		{
			//one pseudo count in each of the SplitNum intervals
			Prior.ContEst.Min=OrgInfo.ValidAttrs[Attr].Min;
			Prior.ContEst.Max=OrgInfo.ValidAttrs[Attr].Max;
			Prior.ContEst.Count=SplitNum;
			for(int Bin=0;Bin<SplitNum;Bin++)
				Prior.ContEst.Vals.push_back(1);
		}
		else
		{
			//a single pseudo count spread evenly over all values of the attribute
			const int ValNum=(int)OrgInfo.ValidAttrs[Attr].Disc.size();
			Prior.DiscEst.Count=1;
			for(int v=0;v<ValNum;v++)
				Prior.DiscEst.AttrCount.push_back(1.0/ValNum);
		}

		vector<EstimatorStr> PerClass;
		for(int Label=0;Label<OrgInfo.ClassNum;Label++)
			PerClass.push_back(Prior);
		Estims.push_back(PerClass);
	}

	//step 2: count the training instances
	for(int Row=0;Row<OrgInfo.Height;Row++)
	{
		const int Class=OrgData[Row][LabelCol].Discr;
		for(int Attr=0;Attr<LabelCol;Attr++)
		{
			if(OrgInfo.ValidAttrs[Attr].AttType==ATT_DISCRETE)
			{
				//Attr: attribute, Class: label, Val: value of the attribute
				const int Val=OrgData[Row][Attr].Discr;
				EstimatorStr &Est=Estims[Attr][Class];
				Est.DiscEst.Count++;
				Est.DiscEst.AttrCount[Val]++;
			}
			else if(OrgInfo.ValidAttrs[Attr].AttType==ATT_CONTINUOUS ||
				OrgInfo.ValidAttrs[Attr].AttType==ATT_DATETIME)
			{
				//interval index of the value; a degenerate range maps everything to interval 0
				//NOTE(review): the scale factor is the literal 10, not SplitNum - confirm this is
				//intended and matches the mapping used when classifying
				int Bin=(OrgInfo.ValidAttrs[Attr].Max==OrgInfo.ValidAttrs[Attr].Min)?0:
					(int)((OrgData[Row][Attr].Cont-OrgInfo.ValidAttrs[Attr].Min)*10/
					(OrgInfo.ValidAttrs[Attr].Max-OrgInfo.ValidAttrs[Attr].Min));
				//clamp into [0,SplitNum-1]
				if(Bin>=SplitNum)
					Bin=SplitNum-1;
				if(Bin<0)
					Bin=0;

				EstimatorStr &Est=Estims[Attr][Class];
				Est.ContEst.Vals[Bin]++;
				Est.ContEst.Count++;
			}
			//other attribute types are not counted
		}
	}//for data

	//step 3: divide every count by the total of its estimator
	for(int Attr=0;Attr<LabelCol;Attr++)
	{
		const bool IsDiscrete=(OrgInfo.ValidAttrs[Attr].AttType==ATT_DISCRETE);
		const bool IsInterval=(OrgInfo.ValidAttrs[Attr].AttType==ATT_CONTINUOUS ||
			OrgInfo.ValidAttrs[Attr].AttType==ATT_DATETIME);
		if(!IsDiscrete && !IsInterval)
			continue;

		for(int Label=0;Label<OrgInfo.ClassNum;Label++)
		{
			EstimatorStr &Est=Estims[Attr][Label];
			if(IsDiscrete)
			{
				const int ValNum=(int)OrgInfo.ValidAttrs[Attr].Disc.size();
				for(int v=0;v<ValNum;v++)
					Est.DiscEst.AttrCount[v]/=Est.DiscEst.Count;
			}
			else
			{
				for(int Bin=0;Bin<SplitNum;Bin++)
					Est.ContEst.Vals[Bin]/=Est.ContEst.Count;
			}
		}
	}//for attr

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
void CNaiveBayes::Train(const CDataset &TrainSet) { //start time for training clock_t start=clock(); //data const MATRIX &OrgData=TrainSet.GetData(); const CASE_INFO &OrgInfo=TrainSet.GetInfo(); //if range of a continuous attribute changed (extended), should we re-calculate all existed statistics? //we can't, some information has lost. We can only extend the first and the last intervals //statistics for(int i=0;i<OrgInfo.Height;i++) { //label of instance int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr; //each attribute for(int j=0;j<OrgInfo.ValidWidth-1;j++) switch(OrgInfo.ValidAttrs[j].AttType) { case ATT_DISCRETE: { //value of this attribute int Val=OrgData[i][j].Discr; Estims[j][Class].DiscEst.Count++; //j: attribute, Class: label, Val: value of attribute Estims[j][Class].DiscEst.AttrCount[Val]++; } break; case ATT_CONTINUOUS: case ATT_DATETIME: { double Val=OrgData[i][j].Cont; int ValNo; if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min) ValNo=0; else ValNo=(int)((OrgData[i][j].Cont-Estims[j][Class].ContEst.Min)*10/ (Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min)); if(ValNo>=SplitNum) ValNo=SplitNum-1; if(ValNo<0) ValNo=0; Estims[j][Class].ContEst.Vals[ValNo]++; Estims[j][Class].ContEst.Count++; } break; default: break; }//case: attribute type }//for data //calculate all other statistics for(int i=0;i<OrgInfo.ValidWidth-1;i++) { switch(OrgInfo.ValidAttrs[i].AttType) { case ATT_DISCRETE: for(int j=0;j<OrgInfo.ClassNum;j++) { int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size(); for(int k=0;k<ValNum;k++) Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count; } break; case ATT_CONTINUOUS: case ATT_DATETIME: for(int j=0;j<OrgInfo.ClassNum;j++) { for(int k=0;k<SplitNum;k++) Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count; } break; default: break; }//switch }//for attributes //time consumed CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC); }