// Builds a naive Bayes classifier from TrainSet.
//
// TrainSet  : training data; the last valid column of every row is read as the class label.
// USplitNum : stored in SplitNum, the number of intervals kept for each non-discrete attribute.
//
// Three passes are made:
//   1. one estimator per (attribute, class) pair is created and seeded with a prior,
//   2. every training instance is tallied into the estimators of its class,
//   3. every tally is divided by the total of its estimator.
// The clock() time spent is stored in CreatingTime.
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
    Name=MyName;
    SplitNum=USplitNum;

    const clock_t Begin=clock();

    const MATRIX &Rows=TrainSet.GetData();
    const CASE_INFO &Info=TrainSet.GetInfo();
    // index of the label column == number of predictive attributes
    const int LabelCol=Info.ValidWidth-1;

    // pass 1: seed the estimators
    for(int Attr=0;Attr<LabelCol;++Attr)
    {
        EstimatorStr Seed;
        Seed.AttType=Info.ValidAttrs[Attr].AttType;
        if(Seed.AttType!=ATT_DISCRETE)
        {
            // every interval starts with one virtual observation
            Seed.ContEst.Count=SplitNum;
            Seed.ContEst.Max=Info.ValidAttrs[Attr].Max;
            Seed.ContEst.Min=Info.ValidAttrs[Attr].Min;
            for(int Bin=0;Bin<SplitNum;++Bin)
                Seed.ContEst.Vals.push_back(1);
        }
        else
        {
            // a single virtual observation spread evenly over all values
            Seed.DiscEst.Count=1;
            const int ValueNum=(int)Info.ValidAttrs[Attr].Disc.size();
            for(int Value=0;Value<ValueNum;++Value)
                Seed.DiscEst.AttrCount.push_back(1.0/ValueNum);
        }

        // the same seed is copied once for every class label
        Estims.push_back(vector<EstimatorStr>());
        vector<EstimatorStr> &PerClass=Estims.back();
        for(int Label=0;Label<Info.ClassNum;++Label)
            PerClass.push_back(Seed);
    }

    // pass 2: tally the training instances
    for(int Row=0;Row<Info.Height;++Row)
    {
        const int Label=Rows[Row][LabelCol].Discr;

        for(int Attr=0;Attr<LabelCol;++Attr)
        {
            if(Info.ValidAttrs[Attr].AttType==ATT_DISCRETE)
            {
                const int Value=Rows[Row][Attr].Discr;
                EstimatorStr &Est=Estims[Attr][Label];
                ++Est.DiscEst.Count;
                ++Est.DiscEst.AttrCount[Value];
            }
            else if(Info.ValidAttrs[Attr].AttType==ATT_CONTINUOUS ||
                    Info.ValidAttrs[Attr].AttType==ATT_DATETIME)
            {
                // a zero-width range puts everything into the first interval
                int Bin=0;
                // NOTE(review): the scale factor is the literal 10, not SplitNum; the two only
                // agree when SplitNum==10 - confirm against the prediction code before changing.
                if(Info.ValidAttrs[Attr].Max!=Info.ValidAttrs[Attr].Min)
                    Bin=static_cast<int>((Rows[Row][Attr].Cont-Info.ValidAttrs[Attr].Min)*10/
                        (Info.ValidAttrs[Attr].Max-Info.ValidAttrs[Attr].Min));
                // clamp into [0, SplitNum-1]
                if(Bin>=SplitNum)
                    Bin=SplitNum-1;
                if(Bin<0)
                    Bin=0;

                EstimatorStr &Est=Estims[Attr][Label];
                ++Est.ContEst.Vals[Bin];
                ++Est.ContEst.Count;
            }
            // any other attribute type is ignored
        }
    }

    // pass 3: turn the tallies into relative frequencies
    for(int Attr=0;Attr<LabelCol;++Attr)
    {
        const bool IsDiscrete=(Info.ValidAttrs[Attr].AttType==ATT_DISCRETE);
        const bool IsRanged=(Info.ValidAttrs[Attr].AttType==ATT_CONTINUOUS ||
                             Info.ValidAttrs[Attr].AttType==ATT_DATETIME);
        if(!IsDiscrete && !IsRanged)
            continue;

        for(int Label=0;Label<Info.ClassNum;++Label)
        {
            if(IsDiscrete)
            {
                const int ValueNum=(int)Info.ValidAttrs[Attr].Disc.size();
                for(int Value=0;Value<ValueNum;++Value)
                    Estims[Attr][Label].DiscEst.AttrCount[Value]/=Estims[Attr][Label].DiscEst.Count;
            }
            else
            {
                for(int Bin=0;Bin<SplitNum;++Bin)
                    Estims[Attr][Label].ContEst.Vals[Bin]/=Estims[Attr][Label].ContEst.Count;
            }
        }
    }

    // time consumed
    CreatingTime=(double)(clock()-Begin)/CLOCKS_PER_SEC;
}
void CNaiveBayes::Train(const CDataset &TrainSet) { //start time for training clock_t start=clock(); //data const MATRIX &OrgData=TrainSet.GetData(); const CASE_INFO &OrgInfo=TrainSet.GetInfo(); //if range of a continuous attribute changed (extended), should we re-calculate all existed statistics? //we can't, some information has lost. We can only extend the first and the last intervals //statistics for(int i=0;i<OrgInfo.Height;i++) { //label of instance int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr; //each attribute for(int j=0;j<OrgInfo.ValidWidth-1;j++) switch(OrgInfo.ValidAttrs[j].AttType) { case ATT_DISCRETE: { //value of this attribute int Val=OrgData[i][j].Discr; Estims[j][Class].DiscEst.Count++; //j: attribute, Class: label, Val: value of attribute Estims[j][Class].DiscEst.AttrCount[Val]++; } break; case ATT_CONTINUOUS: case ATT_DATETIME: { double Val=OrgData[i][j].Cont; int ValNo; if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min) ValNo=0; else ValNo=(int)((OrgData[i][j].Cont-Estims[j][Class].ContEst.Min)*10/ (Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min)); if(ValNo>=SplitNum) ValNo=SplitNum-1; if(ValNo<0) ValNo=0; Estims[j][Class].ContEst.Vals[ValNo]++; Estims[j][Class].ContEst.Count++; } break; default: break; }//case: attribute type }//for data //calculate all other statistics for(int i=0;i<OrgInfo.ValidWidth-1;i++) { switch(OrgInfo.ValidAttrs[i].AttType) { case ATT_DISCRETE: for(int j=0;j<OrgInfo.ClassNum;j++) { int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size(); for(int k=0;k<ValNum;k++) Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count; } break; case ATT_CONTINUOUS: case ATT_DATETIME: for(int j=0;j<OrgInfo.ClassNum;j++) { for(int k=0;k<SplitNum;k++) Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count; } break; default: break; }//switch }//for attributes //time consumed CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC); }