Example #1
//greedy forward selection: repeatedly add the base classifier that most improves accuracy on the validation set
CForwardSelect::CForwardSelect(const CEnsemble &UEnsemble,const CDataset &ValidatingSet)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();


	//start time for training
	clock_t start=clock();

	//get each base classifier's predictions on the validation set
	vector<CPrediction*> *Predictions=Ensemble.AllClassify(ValidatingSet);

	//initialize with no classifier selected
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	//add classifiers one by one
	double BestAccr=0;
	for(int i=0;i<EnsembleSize;i++)
	{
		//add the best in each round
		int Best=-1;
		for(int j=0;j<EnsembleSize;j++)
		{
			//skip classifiers that have already been selected
			if(Weights[j]>0)continue;
			//add this classifier temporarily
			Weights[j]=1;
			//predicting
			CPrediction *Prediction=Ensemble.Classify(ValidatingSet,*Predictions,Weights);
			double Accuracy=Prediction->GetAccuracy();
			delete Prediction;
			//better accuracy?
			if(Accuracy>BestAccr)
			{
				Best=j;
				BestAccr=Accuracy;
				//accuracy of 1.0 cannot be improved; leave Weights[j]=1 and stop searching
				if(Accuracy>=1.0)
					break;
			}
			//restore the weight to its initial state
			Weights[j]=0;
		}
		//a perfect subset was found; the best classifier's weight is already set
		if(BestAccr>=1.0)
			break;
		//select the best classifier of this round; if none improved
		//accuracy, later rounds cannot improve it either, so stop
		if(Best!=-1)
			Weights[Best]=1;
		else
			break;
	}

	for(int i=0;i<EnsembleSize;i++)
		delete ((*Predictions)[i]);
	delete Predictions;
	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
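
The loop above depends on the library's CEnsemble and CPrediction classes. Below is a minimal standalone sketch of the same greedy forward-selection idea; evalAccuracy is a hypothetical stand-in for Ensemble.Classify(...)->GetAccuracy(), and the toy Correct matrix (which classifier gets which instance right) is invented for illustration, not taken from the library.

#include <cstdio>
#include <vector>
using namespace std;

//stand-in for Ensemble.Classify(...)->GetAccuracy(): accuracy of weighted
//majority voting over 0/1 correctness records
static double evalAccuracy(const vector<vector<int> > &Correct,const vector<double> &Weights)
{
	int Hits=0;
	for(int i=0;i<(int)Correct.size();i++)
	{
		double Vote=0,Total=0;
		for(int j=0;j<(int)Weights.size();j++)
		{
			Vote+=Weights[j]*Correct[i][j];
			Total+=Weights[j];
		}
		if(Total>0 && Vote*2>Total)
			Hits++;
	}
	return (double)Hits/Correct.size();
}

int main()
{
	//Correct[i][j]=1 iff classifier j predicts instance i correctly (toy data)
	vector<vector<int> > Correct={{1,0,1},{0,1,1},{1,1,0},{0,0,1}};
	int EnsembleSize=3;
	vector<double> Weights(EnsembleSize,0);
	double BestAccr=0;
	for(int Round=0;Round<EnsembleSize;Round++)
	{
		int Best=-1;
		for(int j=0;j<EnsembleSize;j++)
		{
			if(Weights[j]>0)continue;	//already selected
			Weights[j]=1;			//tentatively add classifier j
			double Accuracy=evalAccuracy(Correct,Weights);
			Weights[j]=0;			//undo the tentative addition
			if(Accuracy>BestAccr)
			{
				Best=j;
				BestAccr=Accuracy;
			}
		}
		if(Best==-1)break;			//no classifier improves accuracy
		Weights[Best]=1;
		printf("round %d: selected classifier %d, accuracy %.2f\n",Round,Best,BestAccr);
	}
	return 0;
}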
Example #2
CPMEP::CPMEP(const CEnsemble &UEnsemble,const CDataset &ValidatingSet,const vector<CPrediction*> &Predictions)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	if(Ensemble.GetSize()!=(int)Predictions.size())
		throw(CError("CPMEP: number of predictions does not match ensemble size!",100,0));
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	//start time for training
	clock_t start=clock();

	//table: instance-classifier-prediction
	vector<CaseClassArrayStr> CaseClassTab;
	BuildCaseClassTab(CaseClassTab,ValidatingSet,Predictions);
//	Dump("c.txt",CaseClassTab);
	//construct FP-tree
	TreeNodeStr Root;
	BuildFPTree(Root,CaseClassTab,EnsembleSize);
	
	vector<SelClassifierStr> SelClassifiers;
	//k: number of classifiers to be selected; only odd sizes are tried,
	//since an odd committee avoids ties in majority voting
	for(int k=1;k<=EnsembleSize/2*2+1;k+=2)
	{
		//path-table: paths of length k/2+1, the minimum number of correct votes for a majority of k
		vector<TreePathStr> TreePathTable;
		TreePathStr TreePath;
		BuildPathTab(TreePathTable,Root,TreePath,k/2+1);
//		Dump("cc.txt",TreePathTable);

		//selected classifiers (no more than k)
		SelClassifierStr S,TempS;
		S.Count=0;
		//add paths until path-table is empty
		while((int)TreePathTable.size()>0 && (int)S.Set.size()<k)
		{
			//two-key sort: order by number of classifiers, then stable-sort by Count so Count becomes the primary key
			sort(TreePathTable.begin(),TreePathTable.end(),ClassNumOrder);
			stable_sort(TreePathTable.begin(),TreePathTable.end(),CountDescOrder);
//			Dump("TreePathTable.txt",TreePathTable);

			//temporarily select all classifiers of the first path
			TempS=S;
			TempS.Count+=TreePathTable[0].Count;
			for(int j=0;j<(int)TreePathTable[0].Classifiers.size();j++)
				TempS.Set.insert(TreePathTable[0].Classifiers[j]);

			//does the enlarged selection still fit within k?
			if((int)TempS.Set.size()<=k)
			{
				S=TempS;

				//remove classifiers of selected path from all rows of path-table
				for(int jj=0;jj<(int)TreePathTable[0].Classifiers.size();jj++)
				{
					for(int i=1;i<(int)TreePathTable.size();i++)
						for(int j=0;j<(int)TreePathTable[i].Classifiers.size();j++)
							if(TreePathTable[i].Classifiers[j]==TreePathTable[0].Classifiers[jj])
							{
								TreePathTable[i].Classifiers.erase(TreePathTable[i].Classifiers.begin()+j);
								break;
							}
				}
//				Dump("TreePathTable.txt",TreePathTable);

				//remove empty row from path-table
				for(int i=1;i<(int)TreePathTable.size();)
				{
					if(TreePathTable[i].Classifiers.size()<=0)
					{
						//credit the removed path's Count to the current selection
						S.Count+=TreePathTable[i].Count;

						TreePathTable.erase(TreePathTable.begin()+i);
						continue;
					}
					i++;
				}
				//this path is finished
				TreePathTable.erase(TreePathTable.begin());
//				Dump("TreePathTable.txt",TreePathTable);

				//merge paths that now contain the same classifier set
				for(int i=0;i<(int)TreePathTable.size();i++)
				{
					set<int> A0;
					for(int jj=0;jj<(int)TreePathTable[i].Classifiers.size();jj++)
						A0.insert(TreePathTable[i].Classifiers[jj]);

					for(int j=i+1;j<(int)TreePathTable.size();)
					{
						set<int> A1;
						for(int jj=0;jj<(int)TreePathTable[j].Classifiers.size();jj++)
							A1.insert(TreePathTable[j].Classifiers[jj]);
						if(A0==A1)
						{
							TreePathTable[i].Count+=TreePathTable[j].Count;
							TreePathTable.erase(TreePathTable.begin()+j);
							continue;
						}
						j++;
					}
				}//for i
//				Dump("TreePathTable.txt",TreePathTable);
			}
			else//adding this path would push the selected set above k classifiers, so skip it
				TreePathTable.erase(TreePathTable.begin());
		}//while
		SelClassifiers.push_back(S);

	}//for k
//	deltree(&Root);

	//two-key sort: by set size, then stable-sort by Count (descending) as the primary key
	sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassSetSizeOrder);
	stable_sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassCountDescOrder);

	//set the weight of selected classifiers
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	set<int>::iterator i_Classifier;
	for(i_Classifier=SelClassifiers[0].Set.begin();i_Classifier!=SelClassifiers[0].Set.end();i_Classifier++)
		Weights[*i_Classifier]=1.0;

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
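
The two sort calls in CPMEP rely on a standard idiom: sort by the secondary key first, then stable_sort by the primary key, so rows with equal primary keys keep the secondary order. A minimal sketch with a toy PathStr struct; the comparator names mirror the originals, but the struct and data are invented for illustration.

#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;

struct PathStr{int Count;int ClassNum;};

static bool ClassNumOrder(const PathStr &a,const PathStr &b){return a.ClassNum<b.ClassNum;}
static bool CountDescOrder(const PathStr &a,const PathStr &b){return a.Count>b.Count;}

int main()
{
	vector<PathStr> Paths={{3,2},{5,4},{5,1},{3,3}};
	//secondary key first: fewer classifiers come first
	sort(Paths.begin(),Paths.end(),ClassNumOrder);
	//primary key second: stable_sort keeps the ClassNum order within equal Counts
	stable_sort(Paths.begin(),Paths.end(),CountDescOrder);
	for(int i=0;i<(int)Paths.size();i++)
		printf("Count=%d ClassNum=%d\n",Paths[i].Count,Paths[i].ClassNum);
	//prints (5,1) (5,4) (3,2) (3,3): highest Count first, fewest classifiers breaking ties
	return 0;
}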
Example #3
//CaseClassTab: (CaseNum+1) x (EnsembleSize+1) table; cell [j][i] is 1 if classifier i
//predicts instance j correctly; the last row and column hold the margin totals
int CPMEP::BuildCaseClassTab(vector<CaseClassArrayStr> &CaseClassTab,const CDataset &ValidatingSet,
	const vector<CPrediction*> &Predictions)
{
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	if(Predictions[0]->GetCaseNum()!=CaseNum)
	{
		printf("DataSet->height!=BpnnResult->CaseNum");
		return 1;
	}

	//construct and initialize the table
	CaseClassTab.clear();
	{
		//one row per instance
		vector<CaseClassRecStr> CaseClassArray;
		//one column per classifier
		CaseClassRecStr CaseClassRec;
		CaseClassRec.Correct=0;
//		CaseClassRec.NodeLink=NULL;
		//EnsembleSize+1 columns; the last column counts the classifiers
		//that predict this instance correctly
		for(int k=0;k<=EnsembleSize;k++)
		{
			CaseClassRec.Classifier=k;
			CaseClassArray.push_back(CaseClassRec);
		}

		//CaseNum+1 rows; the last row holds, for each classifier, the number
		//of instances it predicts correctly (together with its id)
		for(int j=0;j<=CaseNum;j++)
			CaseClassTab.push_back(CaseClassArray);
	}

	//fill it
	for(int i=0;i<EnsembleSize;i++)
	{
		//each instance
		for(int j=0;j<CaseNum;j++)
		{
			//is this prediction correct?
			if(Predictions[i]->GetCorrectness()[j])
			{
				if(CaseClassTab[j][i].Correct!=0)
				{
					printf("CaseClassTab[j][i].Correct!=0");
					return 2;
				}
				CaseClassTab[j][i].Correct++;
				//last column: number of classifiers that predict this instance correctly
				CaseClassTab[j][EnsembleSize].Correct++;
				//last row: number of instances correctly predicted by this classifier
				CaseClassTab[CaseNum][i].Correct++;
			}
		}
	}

	//sort the last row's columns in descending order of the corresponding classifiers' prediction accuracy
	sort(CaseClassTab[CaseNum].begin(),CaseClassTab[CaseNum].end(),CorrectDescOrder);
//	Dump("a.txt",CaseClassTab);
	//reorder the columns of all other rows to match the order of the last row
	for(int i=0;i<EnsembleSize;i++)
	{
		//the remaining classifiers predict no instance correctly, so their order no longer matters
		if(CaseClassTab[CaseNum][i].Correct==0)break;
		//find each column's new position
		int k;
		for(k=i;k<EnsembleSize;k++)
			if(CaseClassTab[0][k].Classifier==CaseClassTab[CaseNum][i].Classifier)
				break;
		//already in the right position?
		if(k==i)
			continue;

		//swap columns k and i in every data row (the last row is already ordered)
		CaseClassRecStr TempCaseClassRec;
		for(int j=0;j<CaseNum;j++)
		{
			TempCaseClassRec=CaseClassTab[j][i];
			CaseClassTab[j][i]=CaseClassTab[j][k];
			CaseClassTab[j][k]=TempCaseClassRec;
		}
	}
//	Dump("a.txt",CaseClassTab);

	return 0;
}
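
The column reordering in BuildCaseClassTab is easier to follow on a toy table. The sketch below uses plain ints instead of CaseClassRecStr and rebuilds each row through a permutation instead of the in-place swaps above; it is a simplified illustration of the same sort-one-row-then-reorder-the-rest step, not the original code.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>
using namespace std;

int main()
{
	//toy table: Correct[j][i]=1 iff classifier i predicts instance j correctly
	vector<vector<int> > Correct={{1,0,1},{1,1,0},{1,0,0}};
	int CaseNum=3,EnsembleSize=3;
	//per-classifier totals tagged with classifier ids (the "last row")
	vector<pair<int,int> > Totals;	//(correct count, classifier id)
	for(int i=0;i<EnsembleSize;i++)
	{
		int c=0;
		for(int j=0;j<CaseNum;j++)
			c+=Correct[j][i];
		Totals.push_back(make_pair(c,i));
	}
	//sort classifiers by descending accuracy, as BuildCaseClassTab does
	sort(Totals.begin(),Totals.end(),greater<pair<int,int> >());
	//reorder every data row to match the sorted order
	for(int j=0;j<CaseNum;j++)
	{
		vector<int> Row(EnsembleSize);
		for(int i=0;i<EnsembleSize;i++)
			Row[i]=Correct[j][Totals[i].second];
		Correct[j]=Row;
	}
	for(int j=0;j<CaseNum;j++)
		printf("%d %d %d\n",Correct[j][0],Correct[j][1],Correct[j][2]);
	return 0;
}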
Example #4
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
	Name=MyName;
	SplitNum=USplitNum;
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//initialize all data structures
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		//each attribute
		EstimatorStr Estim;
		Estim.AttType=OrgInfo.ValidAttrs[i].AttType;
		if(Estim.AttType==ATT_DISCRETE)
		{
			//Laplace estimator
			Estim.DiscEst.Count=1;
			int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
			for(int j=0;j<ValNum;j++)
				Estim.DiscEst.AttrCount.push_back(1.0/ValNum);
		}
		//continuous attribute
		else
		{
			//Laplace estimator
			Estim.ContEst.Count=SplitNum;
			Estim.ContEst.Max=OrgInfo.ValidAttrs[i].Max;
			Estim.ContEst.Min=OrgInfo.ValidAttrs[i].Min;
			for(int j=0;j<SplitNum;j++)
				Estim.ContEst.Vals.push_back(1);
		}

		//for each attribute: one estimator per class label
		vector<EstimatorStr> EstiAttr;
		for(int j=0;j<OrgInfo.ClassNum;j++)
			EstiAttr.push_back(Estim);
		//all attributes
		Estims.push_back(EstiAttr);
	}

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						int ValNo;

						if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min)
							ValNo=0;
						else
							//equal-width bin index in [0,SplitNum)
							ValNo=(int)((Val-OrgInfo.ValidAttrs[j].Min)*SplitNum/
								(OrgInfo.ValidAttrs[j].Max-OrgInfo.ValidAttrs[j].Min));
						if(ValNo>=SplitNum)
							ValNo=SplitNum-1;
						if(ValNo<0)
							ValNo=0;
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}
	}//for data

	//normalize the accumulated counts into conditional probabilities
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
			case ATT_DISCRETE:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
					for(int k=0;k<ValNum;k++)
						Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
				}
				break;
			case ATT_CONTINUOUS:
			case ATT_DATETIME:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					for(int k=0;k<SplitNum;k++)
						Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
				}
				break;
			default:
				break;
		}//switch
	}//for attr

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
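
The constructor discretizes each continuous attribute into SplitNum equal-width bins over [Min,Max], clamping out-of-range values into the first and last bins. A standalone sketch of that binning rule; BinIndex is a hypothetical helper written for illustration, not part of the library.

#include <cstdio>

//equal-width binning: maps Val in [Min,Max] to a bin index in [0,SplitNum),
//clamping values that fall outside the range
static int BinIndex(double Val,double Min,double Max,int SplitNum)
{
	if(Max==Min)
		return 0;			//degenerate attribute: a single bin
	int ValNo=(int)((Val-Min)*SplitNum/(Max-Min));
	if(ValNo>=SplitNum)
		ValNo=SplitNum-1;		//Val at or above Max: last bin
	if(ValNo<0)
		ValNo=0;			//Val below Min: first bin
	return ValNo;
}

int main()
{
	//with Min=0, Max=10 and 5 bins, each bin covers a width of 2
	printf("%d %d %d %d\n",BinIndex(0,0,10,5),BinIndex(3.9,0,10,5),
		BinIndex(10,0,10,5),BinIndex(12,0,10,5));	//prints 0 1 4 4
	return 0;
}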
Example #5
void CNaiveBayes::Train(const CDataset &TrainSet)
{
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//if the range of a continuous attribute has changed (extended), should all existing
	//statistics be re-calculated? They can't be: that information is already lost. We can
	//only let the first and last intervals absorb the out-of-range values.

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		//label of instance
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		//each attribute
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						//value of this attribute
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						int ValNo;

						if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min)
							ValNo=0;
						else
							//equal-width bin index in [0,SplitNum)
							ValNo=(int)((Val-Estims[j][Class].ContEst.Min)*SplitNum/
								(Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min));
						if(ValNo>=SplitNum)
							ValNo=SplitNum-1;
						if(ValNo<0)
							ValNo=0;
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}//case: attribute type
	}//for data

	//normalize the accumulated counts into conditional probabilities
	//(caution: the constructor already normalized once, so repeated Train calls
	//mix probabilities and raw counts in the same estimators)
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
		case ATT_DISCRETE:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
				for(int k=0;k<ValNum;k++)
					Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
			}
			break;
		case ATT_CONTINUOUS:
		case ATT_DATETIME:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				for(int k=0;k<SplitNum;k++)
					Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
			}
			break;
		default:
			break;
		}//switch
	}//for attributes

	//time consumed
	CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC);
}
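
Both the constructor and Train follow the same pattern for discrete attributes: start from Laplace-style pseudo-counts, accumulate observed counts, then divide by the total. The worked sketch below (toy data, not from the library) shows why the smoothing keeps even unseen attribute values at a nonzero conditional probability.

#include <cstdio>
#include <vector>
using namespace std;

int main()
{
	//Laplace-style initialization for a discrete attribute with ValNum values:
	//pseudo-counts summing to 1, total count starting at 1
	int ValNum=3;
	double Count=1;
	vector<double> AttrCount(ValNum,1.0/ValNum);
	//observe 3 training instances of one class with attribute values 0,0,1
	//(value 2 never appears)
	int Observed[3]={0,0,1};
	for(int i=0;i<3;i++)
	{
		AttrCount[Observed[i]]++;
		Count++;
	}
	//normalize, as the final pass of the constructor does
	for(int k=0;k<ValNum;k++)
		AttrCount[k]/=Count;
	//P(value|class): the unseen value 2 still gets a nonzero probability
	printf("%.3f %.3f %.3f\n",AttrCount[0],AttrCount[1],AttrCount[2]);	//0.583 0.333 0.083
	return 0;
}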