double CAdaBoost::Deviance(const CDataset& kData, const Bag& kBag,
                           const double* kFuncEstimate) {
  double loss = 0.0;
  double weight = 0.0;

  // Switch to validation set if necessary
  unsigned long num_of_rows_in_set = kData.get_size_of_set();

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : loss, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < num_of_rows_in_set; i++) {
    loss += kData.weight_ptr()[i] *
            std::exp(-(2 * kData.y_ptr()[i] - 1) *
                     (kData.offset_ptr()[i] + kFuncEstimate[i]));
    weight += kData.weight_ptr()[i];
  }

  // TODO: Check if weights are all zero for validation set
  if ((weight == 0.0) && (loss == 0.0)) {
    return nan("");
  } else if (weight == 0.0) {
    return HUGE_VAL;
  }

  return loss / weight;
}
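// The deviance above is the weighted mean of the exponential (AdaBoost) loss,
//   sum_i w_i * exp(-(2*y_i - 1) * (o_i + F_i)) / sum_i w_i,
// where y in {0,1} is recoded to {-1,+1} via (2*y - 1), o_i is the offset and
// F_i the current function estimate.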
void CAdaBoost::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                       const double* kFuncEstimate,
                                       std::vector<double>& residuals) {
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    residuals[i] = -(2 * kData.y_ptr()[i] - 1) *
                   std::exp(-(2 * kData.y_ptr()[i] - 1) *
                            (kData.offset_ptr()[i] + kFuncEstimate[i]));
  }
}
void CPoisson::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                      const double* kFuncEstimate,
                                      std::vector<double>& residuals) {
  // compute working response
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    const double delta_func_est = kFuncEstimate[i] + kData.offset_ptr()[i];
    residuals[i] = kData.y_ptr()[i] - std::exp(delta_func_est);
  }
}
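// With a log link, exp(F_i + o_i) is the predicted Poisson mean, so the
// working response y_i - exp(F_i + o_i) is the negative gradient of the
// Poisson deviance with respect to F_i.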
void CSVM::PrepareData(const CDataset &OrgSet,struct svm_problem &DataDesc)
{
	//for SVM, we need to expand all multi-valued discrete attributes of the training data into multiple continuous attributes
	//expand discrete attributes
	const CDataset *TrainSet=&OrgSet;
	if(!OrgSet.AllContinuous())
		TrainSet=OrgSet.ExpandDiscrete();
	const MATRIX &TrainData=TrainSet->GetData();
	const CASE_INFO &CaseInfo=TrainSet->GetInfo();

	//number of attributes of the data set
	AttributeNum=CaseInfo.ValidWidth-1;
	//instances are formatted as required by libsvm
	//number of instances
	DataDesc.l=CaseInfo.Height;
	//labels of instances
	DataDesc.y=new double[DataDesc.l];
	//content of instances (all attributes plus a tag for end of line; each node is initialized as end of a row)
	struct svm_node Val={-1,0};
	fill_d2(struct svm_node,DataDesc.x,CaseInfo.Height,CaseInfo.ValidWidth,Val);
	for(int i=0;i<CaseInfo.Height;i++)
	{
		DataDesc.y[i]=(double)TrainData[i][CaseInfo.ValidWidth-1].Discr;
		int ValidValue=0;
		for(int j=0;j<CaseInfo.ValidWidth-1;j++)
		{
			if(CaseInfo.ValidAttrs[j].AttType==ATT_DISCRETE)
			{
				throw(CError("SVM: discrete attribute should have been expanded!\n",100,0));
			}
			else//range scaling
			{
				//zero values are skipped (libsvm sparse format); degenerate ranges are skipped too
				if(TrainData[i][j].Cont==0)
					continue;
				else if(CaseInfo.ValidAttrs[j].Max==CaseInfo.ValidAttrs[j].Min)
					continue;
				else
				{
					DataDesc.x[i][ValidValue].index=j+1;
					DataDesc.x[i][ValidValue].value=(TrainData[i][j].Cont-CaseInfo.ValidAttrs[j].Min)/
						(CaseInfo.ValidAttrs[j].Max-CaseInfo.ValidAttrs[j].Min);
					ValidValue++;
				}
			}
		}
		//tag for end of line has already been set
	}
	if(!OrgSet.AllContinuous())
		delete TrainSet;
	return;
}
double CPoisson::InitF(const CDataset& kData) {
  double sum = 0.0;
  double denom = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : sum, denom) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    sum += kData.weight_ptr()[i] * kData.y_ptr()[i];
    denom += kData.weight_ptr()[i] * std::exp(kData.offset_ptr()[i]);
  }

  return std::log(sum / denom);
}
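// The initial estimate is the constant that matches the weighted totals under
// the log link: F0 = log( sum_i w_i*y_i / sum_i w_i*exp(o_i) ), which is the
// maximizer of the weighted Poisson log-likelihood over a constant fit.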
void CGaussian::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                       const double* kFuncEstimate,
                                       std::vector<double>& residuals) {
  if (!(kData.y_ptr() && kFuncEstimate && kData.weight_ptr())) {
    throw gbm_exception::InvalidArgument();
  }

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    residuals[i] = kData.y_ptr()[i] - kData.offset_ptr()[i] - kFuncEstimate[i];
  }
}
double CGaussian::InitF(const CDataset& kData) {
  double sum = 0.0;
  double totalweight = 0.0;

  // compute the mean
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : sum, totalweight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    sum += kData.weight_ptr()[i] * (kData.y_ptr()[i] - kData.offset_ptr()[i]);
    totalweight += kData.weight_ptr()[i];
  }

  return sum / totalweight;
}
//select base classifiers by greedy forward selection, maximizing accuracy on the validation set
CForwardSelect::CForwardSelect(const CEnsemble &UEnsemble,const CDataset &ValidatingSet)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	//start time for training
	clock_t start=clock();

	//get predictions of all base classifiers
	vector<CPrediction*> *Predictions=Ensemble.AllClassify(ValidatingSet);

	//initialize with no classifier selected
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	//add classifiers one by one
	double BestAccr=0;
	for(int i=0;i<EnsembleSize;i++)
	{
		//add the best one in each round
		int Best=-1;
		for(int j=0;j<EnsembleSize;j++)
		{
			//skip classifiers that have already been selected
			if(Weights[j]>0)
				continue;
			//add this classifier temporarily
			Weights[j]=1;
			//predicting
			CPrediction *Prediction=Ensemble.Classify(ValidatingSet,*Predictions,Weights);
			double Accuracy=Prediction->GetAccuracy();
			delete Prediction;
			//better accuracy?
			if(Accuracy>BestAccr)
			{
				Best=j;
				BestAccr=Accuracy;
				//if accuracy is 1.0, no better one can be found
				if(Accuracy>=1.0)
					break;
			}
			//restore the initial state
			Weights[j]=0;
		}
		//if accuracy is 1.0, no better one can be found
		if(BestAccr>=1.0)
			break;
		//select the best one of this round
		if(Best!=-1)
			Weights[Best]=1;
	}

	for(int i=0;i<EnsembleSize;i++)
		delete ((*Predictions)[i]);
	delete Predictions;
	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
void CPoisson::FitBestConstant(const CDataset& kData, const Bag& kBag,
                               const double* kFuncEstimate,
                               unsigned long num_terminalnodes,
                               std::vector<double>& residuals,
                               CCARTTree& tree) {
  unsigned long obs_num = 0;
  unsigned long node_num = 0;
  vector<double> numerator_vec(num_terminalnodes, 0.0);
  vector<double> denominator_vec(num_terminalnodes, 0.0);
  vector<double> max_vec(num_terminalnodes, -HUGE_VAL);
  vector<double> min_vec(num_terminalnodes, HUGE_VAL);

  for (obs_num = 0; obs_num < kData.get_trainsize(); obs_num++) {
    if (kBag.get_element(obs_num)) {
      numerator_vec[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] * kData.y_ptr()[obs_num];
      denominator_vec[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] *
          std::exp(kData.offset_ptr()[obs_num] + kFuncEstimate[obs_num]);
    }
    // track the largest and smallest offset + estimate seen in each node so
    // the node prediction can be capped below
    max_vec[tree.get_node_assignments()[obs_num]] =
        R::fmax2(kData.offset_ptr()[obs_num] + kFuncEstimate[obs_num],
                 max_vec[tree.get_node_assignments()[obs_num]]);
    min_vec[tree.get_node_assignments()[obs_num]] =
        R::fmin2(kData.offset_ptr()[obs_num] + kFuncEstimate[obs_num],
                 min_vec[tree.get_node_assignments()[obs_num]]);
  }

  for (node_num = 0; node_num < num_terminalnodes; node_num++) {
    if (tree.has_node(node_num)) {
      if (numerator_vec[node_num] == 0.0) {
        // DEBUG: if vecdNum==0 then prediction = -Inf
        // Not sure what else to do except plug in an arbitrary
        //   negative number, -1? -10? Let's use -1, then make
        //   sure |adF| < 19 always.
        tree.get_terminal_nodes()[node_num]->set_prediction(-19.0);
      } else if (denominator_vec[node_num] == 0.0) {
        tree.get_terminal_nodes()[node_num]->set_prediction(0.0);
      } else {
        tree.get_terminal_nodes()[node_num]->set_prediction(
            std::log(numerator_vec[node_num] / denominator_vec[node_num]));
      }
      tree.get_terminal_nodes()[node_num]->set_prediction(
          R::fmin2(tree.get_terminal_nodes()[node_num]->get_prediction(),
                   19 - max_vec[node_num]));
      tree.get_terminal_nodes()[node_num]->set_prediction(
          R::fmax2(tree.get_terminal_nodes()[node_num]->get_prediction(),
                   -19 - min_vec[node_num]));
    }
  }
}
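// Each terminal node's constant is the closed-form maximizer of the Poisson
// log-likelihood over the in-bag observations assigned to that node,
//   log( sum_i w_i*y_i / sum_i w_i*exp(o_i + F_i) ),
// then clamped so that offset + estimate stays within [-19, 19] to avoid
// overflow when exponentiating.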
double CGaussian::BagImprovement(const CDataset& kData, const Bag& kBag,
                                 const double* kFuncEstimate,
                                 const double kShrinkage,
                                 const std::vector<double>& kDeltaEstimate) {
  double returnvalue = 0.0;
  double weight = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : returnvalue, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    if (!kBag.get_element(i)) {
      const double deltafunc_est = kFuncEstimate[i] + kData.offset_ptr()[i];
      returnvalue += kData.weight_ptr()[i] * kShrinkage * kDeltaEstimate[i] *
                     (2.0 * (kData.y_ptr()[i] - deltafunc_est) -
                      kShrinkage * kDeltaEstimate[i]);
      weight += kData.weight_ptr()[i];
    }
  }

  return returnvalue / weight;
}
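// For each out-of-bag observation the accumulated term
//   w * [ (y - F)^2 - (y - F - shrinkage*delta)^2 ]
//     = w * shrinkage*delta * ( 2*(y - F) - shrinkage*delta )
// is the decrease in squared error from taking the shrunken step, so the
// return value is the weighted mean out-of-bag improvement.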
double CGaussian::Deviance(const CDataset& kData, const Bag& kBag,
                           const double* kFuncEstimate) {
  double loss = 0.0;
  double weight = 0.0;

  unsigned long num_rows_in_set = kData.get_size_of_set();

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : loss, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < num_rows_in_set; i++) {
    const double tmp =
        (kData.y_ptr()[i] - kData.offset_ptr()[i] - kFuncEstimate[i]);
    loss += kData.weight_ptr()[i] * tmp * tmp;
    weight += kData.weight_ptr()[i];
  }

  // TODO: Check if weights are all zero for validation set
  if ((weight == 0.0) && (loss == 0.0)) {
    return nan("");
  } else if (weight == 0.0) {
    return copysign(HUGE_VAL, loss);
  }

  return loss / weight;
}
double CAdaBoost::InitF(const CDataset& kData) {
  double numerator = 0.0;
  double denominator = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : numerator, denominator) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    if (kData.y_ptr()[i] == 1.0) {
      numerator += kData.weight_ptr()[i] * std::exp(-kData.offset_ptr()[i]);
    } else {
      denominator += kData.weight_ptr()[i] * std::exp(kData.offset_ptr()[i]);
    }
  }

  return 0.5 * std::log(numerator / denominator);
}
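// Setting the derivative of sum_i w_i*exp(-(2*y_i - 1)*(o_i + F)) with respect
// to F to zero gives the closed-form minimizer of the exponential loss over a
// constant fit:
//   F0 = 0.5 * log( sum_{y=1} w_i*exp(-o_i) / sum_{y=0} w_i*exp(o_i) ).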
void CAdaBoost::FitBestConstant(const CDataset& kData, const Bag& kBag,
                                const double* kFuncEstimate,
                                unsigned long num_terminalnodes,
                                std::vector<double>& residuals,
                                CCARTTree& tree) {
  unsigned long obs_num = 0;
  unsigned long node_num = 0;

  numerator_bestconstant_.resize(num_terminalnodes);
  numerator_bestconstant_.assign(numerator_bestconstant_.size(), 0.0);
  denominator_bestconstant_.resize(num_terminalnodes);
  denominator_bestconstant_.assign(denominator_bestconstant_.size(), 0.0);

  for (obs_num = 0; obs_num < kData.get_trainsize(); obs_num++) {
    if (kBag.get_element(obs_num)) {
      const double deltafunc_est =
          kFuncEstimate[obs_num] + kData.offset_ptr()[obs_num];
      numerator_bestconstant_[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] * (2 * kData.y_ptr()[obs_num] - 1) *
          std::exp(-(2 * kData.y_ptr()[obs_num] - 1) * deltafunc_est);
      denominator_bestconstant_[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] *
          std::exp(-(2 * kData.y_ptr()[obs_num] - 1) * deltafunc_est);
    }
  }

  for (node_num = 0; node_num < num_terminalnodes; node_num++) {
    if (tree.has_node(node_num)) {
      if (denominator_bestconstant_[node_num] == 0) {
        tree.get_terminal_nodes()[node_num]->set_prediction(0.0);
      } else {
        tree.get_terminal_nodes()[node_num]->set_prediction(
            numerator_bestconstant_[node_num] /
            denominator_bestconstant_[node_num]);
      }
    }
  }
}
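// The ratio numerator/denominator is a single Newton-Raphson step for the
// exponential loss within each terminal node: the numerator accumulates the
// negative gradient and the denominator the second derivative, which reduces
// to w*exp(-(2y-1)*(o+F)) because (2y-1)^2 = 1.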
void CNaiveBayes::Train(const CDataset &TrainSet)
{
	//start time for training
	clock_t start=clock();
	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//if the range of a continuous attribute has changed (been extended), should we re-calculate all existing statistics?
	//we can't: some of the information has been lost. We can only extend the first and the last intervals
	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		//label of instance
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		//each attribute
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
				{
					//value of this attribute
					int Val=OrgData[i][j].Discr;
					Estims[j][Class].DiscEst.Count++;
					//j: attribute, Class: label, Val: value of attribute
					Estims[j][Class].DiscEst.AttrCount[Val]++;
				}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
				{
					double Val=OrgData[i][j].Cont;
					//map the value into one of SplitNum equal-width intervals of the recorded range
					int ValNo;
					if(Estims[j][Class].ContEst.Max==Estims[j][Class].ContEst.Min)
						ValNo=0;
					else
						ValNo=(int)((Val-Estims[j][Class].ContEst.Min)*SplitNum/
							(Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min));
					//values outside the recorded range fall into the first or last interval
					if(ValNo>=SplitNum)
						ValNo=SplitNum-1;
					if(ValNo<0)
						ValNo=0;
					Estims[j][Class].ContEst.Vals[ValNo]++;
					Estims[j][Class].ContEst.Count++;
				}
					break;
				default:
					break;
			}//switch: attribute type
	}//for data

	//calculate all other statistics
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
			case ATT_DISCRETE:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
					for(int k=0;k<ValNum;k++)
						Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
				}
				break;
			case ATT_CONTINUOUS:
			case ATT_DATETIME:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					for(int k=0;k<SplitNum;k++)
						Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
				}
				break;
			default:
				break;
		}//switch
	}//for attributes

	//time consumed
	CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC);
}
std::auto_ptr<CDistribution> gbm_setup(const CDataset& data,
                                       const std::string& family, int cTrees,
                                       int cDepth, int cMinObsInNode,
                                       int cNumClasses, double dShrinkage,
                                       double dBagFraction, int cTrain,
                                       int cFeatures, int& cGroups) {
  std::auto_ptr<CDistribution> pDist;
  cGroups = -1;

  // set the distribution
  if (family == "gamma") {
    pDist.reset(new CGamma());
  } else if (family == "tweedie") {
    pDist.reset(new CTweedie(data.misc_ptr()[0]));
  } else if (family == "bernoulli") {
    pDist.reset(new CBernoulli());
  } else if (family == "gaussian") {
    pDist.reset(new CGaussian());
  } else if (family == "poisson") {
    pDist.reset(new CPoisson());
  } else if (family == "adaboost") {
    pDist.reset(new CAdaBoost());
  } else if (family == "coxph") {
    pDist.reset(new CCoxPH());
  } else if (family == "laplace") {
    pDist.reset(new CLaplace());
  } else if (family == "quantile") {
    pDist.reset(new CQuantile(data.misc_ptr()[0]));
  } else if (family == "tdist") {
    pDist.reset(new CTDist(data.misc_ptr()[0]));
  } else if (family == "multinomial") {
    pDist.reset(new CMultinomial(cNumClasses, data.nrow()));
  } else if (family == "huberized") {
    pDist.reset(new CHuberized());
  } else if (family == "pairwise_conc") {
    pDist.reset(new CPairwise("conc"));
  } else if (family == "pairwise_ndcg") {
    pDist.reset(new CPairwise("ndcg"));
  } else if (family == "pairwise_map") {
    pDist.reset(new CPairwise("map"));
  } else if (family == "pairwise_mrr") {
    pDist.reset(new CPairwise("mrr"));
  } else {
    throw GBM::invalid_argument();
  }

  if (0 == family.compare(0, 8, "pairwise")) {
    cGroups = num_groups(data.misc_ptr(), cTrain);
  }

  return pDist;
}
//CaseClassTab: build the instance-classifier correctness table
int CPMEP::BuildCaseClassTab(vector<CaseClassArrayStr> &CaseClassTab,const CDataset &ValidatingSet,
	const vector<CPrediction*> &Predictions)
{
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	if(Predictions[0]->GetCaseNum()!=CaseNum)
	{
		printf("DataSet->height!=BpnnResult->CaseNum");
		return 1;
	}

	//construct and initialize the table
	CaseClassTab.clear();
	{
		//each instance a row
		//row
		vector<CaseClassRecStr> CaseClassArray;
		//item: each classifier a column
		CaseClassRecStr CaseClassRec;
		CaseClassRec.Correct=0;
//		CaseClassRec.NodeLink=NULL;
		//total columns: number of classifiers +1
		//last column: number of classifiers that predict this instance correctly
		for(int k=0;k<=EnsembleSize;k++)
		{
			CaseClassRec.Classifier=k;
			CaseClassArray.push_back(CaseClassRec);
		}
		//total rows: CaseNum+1
		//last row: number of instances predicted correctly by each classifier, and its id
		for(int j=0;j<=CaseNum;j++)
			CaseClassTab.push_back(CaseClassArray);
	}

	//fill it
	for(int i=0;i<EnsembleSize;i++)
	{
		for(int j=0;j<CaseNum;j++)
		{
			//is this prediction correct?
			if(Predictions[i]->GetCorrectness()[j])
			{
				if(CaseClassTab[j][i].Correct!=0)
				{
					printf("CaseClassTab[j][i].Correct!=0");
					return 2;
				}
				CaseClassTab[j][i].Correct++;
				//last column: number of classifiers that predict this instance correctly
				CaseClassTab[j][EnsembleSize].Correct++;
				//last row: number of instances correctly predicted by this classifier
				CaseClassTab[CaseNum][i].Correct++;
			}
		}
	}

	//sort the columns of the last row in descending order of the corresponding classifiers' prediction accuracy
	sort(CaseClassTab[CaseNum].begin(),CaseClassTab[CaseNum].end(),CorrectDescOrder);
//	Dump("a.txt",CaseClassTab);
	//sort the columns of the other rows into the order of the last row
	for(int i=0;i<EnsembleSize;i++)
	{
		//do the remaining classifiers predict no instance correctly?
		if(CaseClassTab[CaseNum][i].Correct==0)
			break;
		//find the column's new position
		int k;
		for(k=i;k<EnsembleSize;k++)
			if(CaseClassTab[0][k].Classifier==CaseClassTab[CaseNum][i].Classifier)
				break;
		//no need to change position?
		if(k==i)
			continue;
		//swap columns k and i
		CaseClassRecStr TempCaseClassRec;
		for(int j=0;j<CaseNum;j++)
		{
			TempCaseClassRec=CaseClassTab[j][i];
			CaseClassTab[j][i]=CaseClassTab[j][k];
			CaseClassTab[j][k]=TempCaseClassRec;
		}
	}
//	Dump("a.txt",CaseClassTab);

	return 0;
}
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
	Name=MyName;
	SplitNum=USplitNum;
	//start time for training
	clock_t start=clock();
	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//initialize all data structures
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		//each attribute
		EstimatorStr Estim;
		Estim.AttType=OrgInfo.ValidAttrs[i].AttType;
		if(Estim.AttType==ATT_DISCRETE)
		{
			//Laplace estimator
			Estim.DiscEst.Count=1;
			int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
			for(int j=0;j<ValNum;j++)
				Estim.DiscEst.AttrCount.push_back(1.0/ValNum);
		}
		//continuous attribute
		else
		{
			//Laplace estimator
			Estim.ContEst.Count=SplitNum;
			Estim.ContEst.Max=OrgInfo.ValidAttrs[i].Max;
			Estim.ContEst.Min=OrgInfo.ValidAttrs[i].Min;
			for(int j=0;j<SplitNum;j++)
				Estim.ContEst.Vals.push_back(1);
		}
		//for each attribute: one estimator per class label
		vector<EstimatorStr> EstiAttr;
		for(int j=0;j<OrgInfo.ClassNum;j++)
			EstiAttr.push_back(Estim);
		//all attributes
		Estims.push_back(EstiAttr);
	}

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
				{
					int Val=OrgData[i][j].Discr;
					Estims[j][Class].DiscEst.Count++;
					//j: attribute, Class: label, Val: value of attribute
					Estims[j][Class].DiscEst.AttrCount[Val]++;
				}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
				{
					double Val=OrgData[i][j].Cont;
					//map the value into one of SplitNum equal-width intervals
					int ValNo;
					if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min)
						ValNo=0;
					else
						ValNo=(int)((Val-OrgInfo.ValidAttrs[j].Min)*SplitNum/
							(OrgInfo.ValidAttrs[j].Max-OrgInfo.ValidAttrs[j].Min));
					if(ValNo>=SplitNum)
						ValNo=SplitNum-1;
					if(ValNo<0)
						ValNo=0;
					Estims[j][Class].ContEst.Vals[ValNo]++;
					Estims[j][Class].ContEst.Count++;
				}
					break;
				default:
					break;
			}
	}//for data

	//get all statistics needed
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
			case ATT_DISCRETE:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
					for(int k=0;k<ValNum;k++)
						Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
				}
				break;
			case ATT_CONTINUOUS:
			case ATT_DATETIME:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					for(int k=0;k<SplitNum;k++)
						Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
				}
				break;
			default:
				break;
		}//switch
	}//for attributes

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
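//The class-conditional distribution of each continuous attribute is
//approximated with a histogram: every class keeps SplitNum equal-width
//interval counts over the attribute's observed [Min, Max] range, initialized
//to 1 (Laplace smoothing) and later divided by the per-class count to give
//interval frequencies.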
CPMEP::CPMEP(const CEnsemble &UEnsemble,const CDataset &ValidatingSet,const vector<CPrediction*> &Predictions)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	if(Ensemble.GetSize()!=(int)Predictions.size())
		throw(CError("CPMEP: wrong size for user-defined weights!",100,0));
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	//start time for training
	clock_t start=clock();

	//table: instance-classifier-prediction
	vector<CaseClassArrayStr> CaseClassTab;
	BuildCaseClassTab(CaseClassTab,ValidatingSet,Predictions);
//	Dump("c.txt",CaseClassTab);
	//construct FP-tree
	TreeNodeStr Root;
	BuildFPTree(Root,CaseClassTab,EnsembleSize);

	vector<SelClassifierStr> SelClassifiers;
	//k: number of classifiers to be selected
	for(int k=1;k<=EnsembleSize/2*2+1;k+=2)
	{
		//path-table: paths with length of k/2+1
		vector<TreePathStr> TreePathTable;
		TreePathStr TreePath;
		BuildPathTab(TreePathTable,Root,TreePath,k/2+1);
//		Dump("cc.txt",TreePathTable);

		//selected classifiers (no more than k)
		SelClassifierStr S,TempS;
		S.Count=0;
		//add paths until the path-table is empty
		while((int)TreePathTable.size()>0 && (int)S.Set.size()<k)
		{
			//sort all paths by Count value and number of classifiers
			sort(TreePathTable.begin(),TreePathTable.end(),ClassNumOrder);
			stable_sort(TreePathTable.begin(),TreePathTable.end(),CountDescOrder);
//			Dump("TreePathTable.txt",TreePathTable);
			//tentatively select all classifiers of the first path
			TempS=S;
			TempS.Count+=TreePathTable[0].Count;
			for(int j=0;j<(int)TreePathTable[0].Classifiers.size();j++)
				TempS.Set.insert(TreePathTable[0].Classifiers[j]);
			//total size still within the limit?
			if((int)TempS.Set.size()<=k)
			{
				S=TempS;
				//remove classifiers of the selected path from all rows of the path-table
				for(int jj=0;jj<(int)TreePathTable[0].Classifiers.size();jj++)
				{
					for(int i=1;i<(int)TreePathTable.size();i++)
						for(int j=0;j<(int)TreePathTable[i].Classifiers.size();j++)
							if(TreePathTable[i].Classifiers[j]==TreePathTable[0].Classifiers[jj])
							{
								TreePathTable[i].Classifiers.erase(TreePathTable[i].Classifiers.begin()+j);
								break;
							}
				}
//				Dump("TreePathTable.txt",TreePathTable);
				//remove empty rows from the path-table
				for(int i=1;i<(int)TreePathTable.size();)
				{
					if(TreePathTable[i].Classifiers.size()<=0)
					{
						//the Count value of the path being removed is added
						TempS.Count+=TreePathTable[i].Count;
						TreePathTable.erase(TreePathTable.begin()+i);
						continue;
					}
					i++;
				}
				//this path is finished
				TreePathTable.erase(TreePathTable.begin());
//				Dump("TreePathTable.txt",TreePathTable);
				//merge identical paths
				for(int i=0;i<(int)TreePathTable.size();i++)
				{
					set<int> A0;
					for(int jj=0;jj<(int)TreePathTable[i].Classifiers.size();jj++)
						A0.insert(TreePathTable[i].Classifiers[jj]);
					for(int j=i+1;j<(int)TreePathTable.size();)
					{
						set<int> A1;
						for(int jj=0;jj<(int)TreePathTable[j].Classifiers.size();jj++)
							A1.insert(TreePathTable[j].Classifiers[jj]);
						if(A0==A1)
						{
							TreePathTable[i].Count+=TreePathTable[j].Count;
							TreePathTable.erase(TreePathTable.begin()+j);
							continue;
						}
						j++;
					}
				}//for i
//				Dump("TreePathTable.txt",TreePathTable);
			}
			else//adding this path would make the number of selected classifiers greater than k, so skip it
				TreePathTable.erase(TreePathTable.begin());
		}//while
		SelClassifiers.push_back(S);
	}//for k
//	deltree(&Root);

	//sort all sets by Count and size
	sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassSetSizeOrder);
	stable_sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassCountDescOrder);
	//set the weights of the selected classifiers
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	set<int>::iterator i_Classifier;
	for(i_Classifier=SelClassifiers[0].Set.begin();i_Classifier!=SelClassifiers[0].Set.end();i_Classifier++)
	{
		for(int i=0;i<EnsembleSize;i++)
			if(*i_Classifier==i)
				Weights[i]=1.0;
	}
	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}