Example #1
0
//Build a naive Bayes model from TrainSet.
//  TrainSet : training data; the last valid column (index ValidWidth-1) is read as the
//             discrete class label, all columns before it as attributes.
//  USplitNum: number of equal-width intervals each continuous/datetime attribute is cut into;
//             values below 1 are raised to 1.
//On return Estims[attribute][class] holds smoothed relative frequencies, the matching Count
//fields hold the totals those frequencies were divided by, and CreatingTime holds the time
//measured with clock(), in seconds.
//Class labels and discrete values read from the data are used as indices without range checks.
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
	Name=MyName;
	//at least one interval is required: with none, Vals would stay empty and Vals[0] below would be invalid
	SplitNum=(USplitNum<1)?1:USplitNum;
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//initialize all data structure
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		//each attribute
		EstimatorStr Estim;
		Estim.AttType=OrgInfo.ValidAttrs[i].AttType;
		if(Estim.AttType==ATT_DISCRETE)
		{
			//Laplace estimator: one pseudo-instance spread evenly over all values
			Estim.DiscEst.Count=1;
			int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
			for(int j=0;j<ValNum;j++)
				Estim.DiscEst.AttrCount.push_back(1.0/ValNum);
		}
		//every other attribute type is set up as a continuous one
		else
		{
			//Laplace estimator: one pseudo-instance in each interval
			Estim.ContEst.Count=SplitNum;
			Estim.ContEst.Max=OrgInfo.ValidAttrs[i].Max;
			Estim.ContEst.Min=OrgInfo.ValidAttrs[i].Min;
			for(int j=0;j<SplitNum;j++)
				Estim.ContEst.Vals.push_back(1);
		}

		//for each attribute: all class label
		vector<EstimatorStr> EstiAttr;
		for(int j=0;j<OrgInfo.ClassNum;j++)
			EstiAttr.push_back(Estim);
		//all attributes
		Estims.push_back(EstiAttr);
	}

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		//label of instance
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		//each attribute
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						//value of this attribute
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						//a zero-width range has a single meaningful interval: the first one
						int ValNo=0;

						if(OrgInfo.ValidAttrs[j].Max!=OrgInfo.ValidAttrs[j].Min)
						{
							//position of the value on a scale of SplitNum equal-width intervals.
							//NOTE(review): this used to be scaled by a fixed 10, which ignored SplitNum;
							//any other code that bins continuous values (e.g. prediction) must use the same formula.
							double Pos=(Val-OrgInfo.ValidAttrs[j].Min)*SplitNum/
								(OrgInfo.ValidAttrs[j].Max-OrgInfo.ValidAttrs[j].Min);
							//clamp before converting: casting an out-of-range double to int is undefined
							if(Pos>=SplitNum)
								ValNo=SplitNum-1;
							else if(Pos>0)
								ValNo=(int)Pos;
							//Pos<=0 (or not a number) stays in the first interval
						}
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}
	}//for data

	//get all statistics needed: turn counts into relative frequencies
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
			case ATT_DISCRETE:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
					for(int k=0;k<ValNum;k++)
						Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
				}
				break;
			case ATT_CONTINUOUS:
			case ATT_DATETIME:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					for(int k=0;k<SplitNum;k++)
						Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
				}
				break;
			default:
				break;
		}//switch
	}//for attr

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
Example #2
0
//Update the existing estimators with the instances of TrainSet (incremental training).
//  TrainSet: additional training data; the last valid column (index ValidWidth-1) is read as
//            the discrete class label, all columns before it as attributes.
//Estims stores relative frequencies together with the Count they were divided by, so the
//stored values are first scaled back to counts, the new instances are added, and the result
//is normalized again. The time measured with clock() is added to CreatingTime.
//NOTE(review): assumes TrainSet has the same attribute layout, discrete value sets and class
//count as the data Estims was built from; indices are not range-checked here.
void CNaiveBayes::Train(const CDataset &TrainSet)
{
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//if range of a continuous attribute changed (extended), should we re-calculate all existed statistics?
	//we can't, some information has lost. We can only extend the first and the last intervals

	//restore counts: frequency*Count undoes the division done by the previous normalization.
	//without this step the +1 increments below would be added to frequencies instead of counts
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
		case ATT_DISCRETE:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int ValNum=(int)Estims[i][j].DiscEst.AttrCount.size();
				for(int k=0;k<ValNum;k++)
					Estims[i][j].DiscEst.AttrCount[k]*=Estims[i][j].DiscEst.Count;
			}
			break;
		case ATT_CONTINUOUS:
		case ATT_DATETIME:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int BinNum=(int)Estims[i][j].ContEst.Vals.size();
				for(int k=0;k<BinNum;k++)
					Estims[i][j].ContEst.Vals[k]*=Estims[i][j].ContEst.Count;
			}
			break;
		default:
			break;
		}//switch
	}//for attributes

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		//label of instance
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		//each attribute
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						//value of this attribute
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						//a zero-width stored range has a single meaningful interval: the first one
						int ValNo=0;

						//test the stored range, because that is the one used as divisor below
						if(Estims[j][Class].ContEst.Max!=Estims[j][Class].ContEst.Min)
						{
							//position of the value on a scale of SplitNum equal-width intervals.
							//NOTE(review): this used to be scaled by a fixed 10, which ignored SplitNum;
							//any other code that bins continuous values must use the same formula.
							double Pos=(Val-Estims[j][Class].ContEst.Min)*SplitNum/
								(Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min);
							//values outside the stored range fall into the first or the last interval.
							//clamp before converting: casting an out-of-range double to int is undefined
							if(Pos>=SplitNum)
								ValNo=SplitNum-1;
							else if(Pos>0)
								ValNo=(int)Pos;
							//Pos<=0 (or not a number) stays in the first interval
						}
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}//case: attribute type
	}//for data

	//calculate all other statistics: turn the updated counts back into relative frequencies
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
		case ATT_DISCRETE:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int ValNum=(int)Estims[i][j].DiscEst.AttrCount.size();
				for(int k=0;k<ValNum;k++)
					Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
			}
			break;
		case ATT_CONTINUOUS:
		case ATT_DATETIME:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int BinNum=(int)Estims[i][j].ContEst.Vals.size();
				for(int k=0;k<BinNum;k++)
					Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
			}
			break;
		default:
			break;
		}//switch
	}//for attributes

	//time consumed
	CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC);
}