Example #1
double CAdaBoost::Deviance(const CDataset& kData, const Bag& kBag,
                           const double* kFuncEstimate) {
  double loss = 0.0;
  double weight = 0.0;

  // Switch to validation set if necessary
  unsigned long num_of_rows_in_set = kData.get_size_of_set();

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : loss, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < num_of_rows_in_set; i++) {
    loss += kData.weight_ptr()[i] *
            std::exp(-(2 * kData.y_ptr()[i] - 1) *
                     (kData.offset_ptr()[i] + kFuncEstimate[i]));
    weight += kData.weight_ptr()[i];
  }

  // TODO: Check if weights are all zero for validation set
  if ((weight == 0.0) && (loss == 0.0)) {
    return nan("");
  } else if (weight == 0.0) {
    return HUGE_VAL;
  }

  return loss / weight;
}
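With the 0/1 outcome y recoded to a sign via 2y - 1, o the offset, f the current function estimate, and w the observation weights, the loop above computes the weighted mean of the AdaBoost exponential loss:

\[
D = \frac{\sum_i w_i \, e^{-(2y_i-1)(o_i+f_i)}}{\sum_i w_i}.
\]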
Example #2
void CAdaBoost::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                       const double* kFuncEstimate,
                                       std::vector<double>& residuals) {
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
  num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    residuals[i] = -(2 * kData.y_ptr()[i] - 1) *
                   std::exp(-(2 * kData.y_ptr()[i] - 1) *
                            (kData.offset_ptr()[i] + kFuncEstimate[i]));
  }
}
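Each residual is the derivative of the per-observation exponential loss with respect to the function estimate,

\[
z_i = -(2y_i-1)\, e^{-(2y_i-1)(o_i+f_i)},
\]

and since the subsequent tree is fit by least squares, negating every residual would leave the chosen splits unchanged.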
Example #3
void CPoisson::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                      const double* kFuncEstimate,
                                      std::vector<double>& residuals) {
// compute working response
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
  num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    const double delta_func_est = kFuncEstimate[i] + kData.offset_ptr()[i];
    residuals[i] = kData.y_ptr()[i] - std::exp(delta_func_est);
  }
}
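For the Poisson model with a log link, the working response is the negative gradient of the negative log-likelihood \(\ell_i = e^{o_i+f_i} - y_i (o_i+f_i)\):

\[
z_i = y_i - e^{o_i+f_i}.
\]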
Example #4
void CSVM::PrepareData(const CDataset &OrgSet,struct svm_problem &DataDesc)
{
	//For SVM we must expand every multi-valued discrete attribute of the training data into multiple continuous attributes.
	//expand discrete attributes
	const CDataset *TrainSet=&OrgSet;
	if(!OrgSet.AllContinuous())
		TrainSet=OrgSet.ExpandDiscrete();
	const MATRIX &TrainData=TrainSet->GetData();
	const CASE_INFO &CaseInfo=TrainSet->GetInfo();

	//number of attribute for data set
	AttributeNum=CaseInfo.ValidWidth-1;
	//instances are formatted as libsvm requires
	//number of instances
	DataDesc.l=CaseInfo.Height;
	//labels of instances
	DataDesc.y=new double[DataDesc.l];
	//content of instances (all attributes plus an end-of-row tag; each node is initialized as an end-of-row marker)
	struct svm_node Val={-1,0};
	fill_d2(struct svm_node,DataDesc.x,CaseInfo.Height,CaseInfo.ValidWidth,Val);
	for(int i=0;i<CaseInfo.Height;i++)
	{
		DataDesc.y[i]=(double)TrainData[i][CaseInfo.ValidWidth-1].Discr;
		int ValidValue=0;
		for(int j=0;j<CaseInfo.ValidWidth-1;j++)
		{
			if(CaseInfo.ValidAttrs[j].AttType==ATT_DISCRETE)
			{
				throw(CError("SVM: discrete attribute should have been expanded!\n",100,0));
			}
			else//min-max scaling into [0,1]
			{
				//zero values and constant attributes stay implicit in the sparse format
				if(TrainData[i][j].Cont==0)
					continue;
				else if(CaseInfo.ValidAttrs[j].Max==CaseInfo.ValidAttrs[j].Min)
					continue;
				else
				{
					DataDesc.x[i][ValidValue].index=j+1;
					DataDesc.x[i][ValidValue].value=(TrainData[i][j].Cont-CaseInfo.ValidAttrs[j].Min)/
						(CaseInfo.ValidAttrs[j].Max-CaseInfo.ValidAttrs[j].Min);
					ValidValue++;
				}
			}
		}
		//tag for end of line has been set
	}

	if(!OrgSet.AllContinuous())
		delete TrainSet;
	return;
}
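Each continuous attribute is min-max scaled into [0, 1] before being stored in libsvm's sparse node format (zeros and constant attributes are skipped, so they stay implicit):

\[
x'_{ij} = \frac{x_{ij} - \min_j}{\max_j - \min_j}.
\]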
Example #5
double CPoisson::InitF(const CDataset& kData) {
  double sum = 0.0;
  double denom = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : sum, denom) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    sum += kData.weight_ptr()[i] * kData.y_ptr()[i];
    denom += kData.weight_ptr()[i] * std::exp(kData.offset_ptr()[i]);
  }

  return std::log(sum / denom);
}
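The returned intercept is the constant that minimizes the Poisson loss in the presence of the offset:

\[
f_0 = \log\frac{\sum_i w_i y_i}{\sum_i w_i\, e^{o_i}}.
\]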
Example #6
void CGaussian::ComputeWorkingResponse(const CDataset& kData, const Bag& kBag,
                                       const double* kFuncEstimate,
                                       std::vector<double>& residuals) {
  if (!(kData.y_ptr() && kFuncEstimate &&
        kData.weight_ptr())) {
    throw gbm_exception::InvalidArgument();
  }

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
  num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    residuals[i] = kData.y_ptr()[i] - kData.offset_ptr()[i] - kFuncEstimate[i];
  }
}
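For squared-error loss \(\tfrac{1}{2}(y_i - o_i - f_i)^2\), the negative gradient is just the plain residual:

\[
z_i = y_i - o_i - f_i.
\]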
Example #7
double CGaussian::InitF(const CDataset& kData) {
  double sum = 0.0;
  double totalweight = 0.0;

// compute the mean

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : sum, totalweight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    sum += kData.weight_ptr()[i] * (kData.y_ptr()[i] - kData.offset_ptr()[i]);
    totalweight += kData.weight_ptr()[i];
  }

  return sum / totalweight;
}
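The intercept is the weighted mean of the offset-adjusted outcomes, the closed-form minimizer of the squared-error loss:

\[
f_0 = \frac{\sum_i w_i (y_i - o_i)}{\sum_i w_i}.
\]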
Example #8
//greedily select base classifiers, each round adding the one that most improves accuracy on the validation set
CForwardSelect::CForwardSelect(const CEnsemble &UEnsemble,const CDataset &ValidatingSet)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();


	//start time for training
	clock_t start=clock();

	//get prediction
	vector<CPrediction*> *Predictions=Ensemble.AllClassify(ValidatingSet);

	//initialize with no classifier selected
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	//add classifier one by one
	double BestAccr=0;
	for(int i=0;i<EnsembleSize;i++)
	{
		//add the best in each round
		int Best=-1;
		for(int j=0;j<EnsembleSize;j++)
		{
			//skip classifiers that have already been selected
			if(Weights[j]>0)continue;
			//add this classifier temporarily
			Weights[j]=1;
			//predicting
			CPrediction *Prediction=Ensemble.Classify(ValidatingSet,*Predictions,Weights);
			double Accuracy=Prediction->GetAccuracy();
			delete Prediction;
			//better accuracy?
			if(Accuracy>BestAccr)
			{
				Best=j;
				BestAccr=Accuracy;
				//if accuracy is 1.0, no better one can be found
				if(Accuracy>=1.0)
					break;
			}
			//restore the initial state
			Weights[j]=0;
		}
		//if accuracy is 1.0, no better one can be found
		if(BestAccr>=1.0)
			break;
		//select the best one of this round
		if(Best!=-1)
			Weights[Best]=1;
	}

	for(int i=0;i<EnsembleSize;i++)
		delete ((*Predictions)[i]);
	delete Predictions;
	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
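The constructor runs greedy forward selection: every round it trials each unselected classifier, keeps the one that most improves validation accuracy, and stops early on perfect accuracy, for O(T^2) ensemble evaluations with T members. A minimal, self-contained sketch of the same loop, where the hypothetical evaluate callback stands in for Ensemble.Classify plus GetAccuracy:

#include <functional>
#include <vector>

std::vector<int> ForwardSelect(
    int size, const std::function<double(const std::vector<int>&)>& evaluate) {
  std::vector<int> weights(size, 0);  // 0/1 selection mask
  double best_accuracy = 0.0;
  for (int round = 0; round < size; ++round) {
    int best = -1;
    for (int j = 0; j < size; ++j) {
      if (weights[j] > 0) continue;  // already selected
      weights[j] = 1;                // trial: add classifier j
      const double accuracy = evaluate(weights);
      weights[j] = 0;                // undo the trial
      if (accuracy > best_accuracy) {
        best = j;
        best_accuracy = accuracy;
      }
    }
    if (best == -1) break;  // no candidate improves accuracy
    weights[best] = 1;      // commit this round's winner
    if (best_accuracy >= 1.0) break;
  }
  return weights;
}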
Example #9
void CPoisson::FitBestConstant(const CDataset& kData, const Bag& kBag,
                               const double* kFuncEstimate,
                               unsigned long num_terminalnodes,
                               std::vector<double>& residuals,
                               CCARTTree& tree) {
  unsigned long obs_num = 0;
  unsigned long node_num = 0;
  vector<double> numerator_vec(num_terminalnodes, 0.0);
  vector<double> denominator_vec(num_terminalnodes, 0.0);
  vector<double> max_vec(num_terminalnodes, -HUGE_VAL);
  vector<double> min_vec(num_terminalnodes, HUGE_VAL);

  for (obs_num = 0; obs_num < kData.get_trainsize(); obs_num++) {
    if (kBag.get_element(obs_num)) {
      numerator_vec[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] * kData.y_ptr()[obs_num];
      denominator_vec[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] *
          std::exp(kData.offset_ptr()[obs_num] + kFuncEstimate[obs_num]);
    }
  }

  for (node_num = 0; node_num < num_terminalnodes; node_num++) {
    if (tree.has_node(node_num)) {
      if (numerator_vec[node_num] == 0.0) {
        // DEBUG: if vecdNum==0 then prediction = -Inf
        // Not sure what else to do except plug in an arbitrary
        //   negative number, -1? -10? Let's use -1, then make
        //   sure |adF| < 19 always.
        tree.get_terminal_nodes()[node_num]->set_prediction(-19.0);
      } else if (denominator_vec[node_num] == 0.0) {
        tree.get_terminal_nodes()[node_num]->set_prediction(0.0);
      } else {
        tree.get_terminal_nodes()[node_num]->set_prediction(
            std::log(numerator_vec[node_num] / denominator_vec[node_num]));
      }
      // NOTE: max_vec and min_vec are never updated in this excerpt, so they
      // keep their -/+HUGE_VAL sentinels and the two clamps below never bind.
      tree.get_terminal_nodes()[node_num]->set_prediction(
          R::fmin2(tree.get_terminal_nodes()[node_num]->get_prediction(),
                   19 - max_vec[node_num]));
      tree.get_terminal_nodes()[node_num]->set_prediction(
          R::fmax2(tree.get_terminal_nodes()[node_num]->get_prediction(),
                   -19 - min_vec[node_num]));
    }
  }
}
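Each terminal node k receives the constant that minimizes the in-bag Poisson loss over its observations, on the log scale (with -19 substituted when the numerator is zero and 0 when only the denominator is):

\[
\hat c_k = \log\frac{\sum_{i \in k} w_i y_i}{\sum_{i \in k} w_i\, e^{o_i+f_i}}.
\]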
Example #10
double CGaussian::BagImprovement(const CDataset& kData, const Bag& kBag,
                                 const double* kFuncEstimate,
                                 const double kShrinkage,
                                 const std::vector<double>& kDeltaEstimate) {
  double returnvalue = 0.0;
  double weight = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : returnvalue, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    if (!kBag.get_element(i)) {
      const double deltafunc_est = kFuncEstimate[i] + kData.offset_ptr()[i];

      returnvalue += kData.weight_ptr()[i] * kShrinkage * kDeltaEstimate[i] *
                     (2.0 * (kData.y_ptr()[i] - deltafunc_est) -
                      kShrinkage * kDeltaEstimate[i]);
      weight += kData.weight_ptr()[i];
    }
  }

  return returnvalue / weight;
}
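For each out-of-bag observation the summand is exactly the reduction in squared error obtained by stepping the fit F = o + f by the shrunken update \(\lambda\Delta\), and the function returns its weighted mean:

\[
(y-F)^2 - (y-F-\lambda\Delta)^2 = \lambda\Delta\bigl(2(y-F) - \lambda\Delta\bigr).
\]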
Example #11
double CGaussian::Deviance(const CDataset& kData, const Bag& kBag,
                           const double* kFuncEstimate) {
  double loss = 0.0;
  double weight = 0.0;

  unsigned long num_rows_in_set = kData.get_size_of_set();
#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : loss, weight) num_threads(get_num_threads())
  for (unsigned long i = 0; i < num_rows_in_set; i++) {
    const double tmp =
        (kData.y_ptr()[i] - kData.offset_ptr()[i] - kFuncEstimate[i]);
    loss += kData.weight_ptr()[i] * tmp * tmp;
    weight += kData.weight_ptr()[i];
  }

  // TODO: Check if weights are all zero for validation set
  if ((weight == 0.0) && (loss == 0.0)) {
    return nan("");
  } else if (weight == 0.0) {
    return copysign(HUGE_VAL, loss);
  }

  return loss / weight;
}
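The Gaussian deviance is the weighted mean squared error of the offset-adjusted fit:

\[
D = \frac{\sum_i w_i (y_i - o_i - f_i)^2}{\sum_i w_i}.
\]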
Example #12
double CAdaBoost::InitF(const CDataset& kData) {
  double numerator = 0.0;
  double denominator = 0.0;

#pragma omp parallel for schedule(static, get_array_chunk_size()) \
    reduction(+ : numerator, denominator) num_threads(get_num_threads())
  for (unsigned long i = 0; i < kData.get_trainsize(); i++) {
    if (kData.y_ptr()[i] == 1.0) {
      numerator += kData.weight_ptr()[i] * std::exp(-kData.offset_ptr()[i]);
    } else {
      denominator += kData.weight_ptr()[i] * std::exp(kData.offset_ptr()[i]);
    }
  }

  return 0.5 * std::log(numerator / denominator);
}
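Setting the derivative of the exponential loss to zero over a constant intercept gives the closed form computed above:

\[
f_0 = \tfrac{1}{2}\log\frac{\sum_{y_i=1} w_i\, e^{-o_i}}{\sum_{y_i=0} w_i\, e^{o_i}}.
\]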
Example #13
void CAdaBoost::FitBestConstant(const CDataset& kData, const Bag& kBag,
                                const double* kFuncEstimate,
                                unsigned long num_terminalnodes,
                                std::vector<double>& residuals,
                                CCARTTree& tree) {
  unsigned long obs_num = 0;
  unsigned long node_num = 0;
  numerator_bestconstant_.resize(num_terminalnodes);
  numerator_bestconstant_.assign(numerator_bestconstant_.size(), 0.0);
  denominator_bestconstant_.resize(num_terminalnodes);
  denominator_bestconstant_.assign(denominator_bestconstant_.size(), 0.0);

  for (obs_num = 0; obs_num < kData.get_trainsize(); obs_num++) {
    if (kBag.get_element(obs_num)) {
      const double deltafunc_est =
          kFuncEstimate[obs_num] + kData.offset_ptr()[obs_num];
      numerator_bestconstant_[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] * (2 * kData.y_ptr()[obs_num] - 1) *
          std::exp(-(2 * kData.y_ptr()[obs_num] - 1) * deltafunc_est);
      denominator_bestconstant_[tree.get_node_assignments()[obs_num]] +=
          kData.weight_ptr()[obs_num] *
          std::exp(-(2 * kData.y_ptr()[obs_num] - 1) * deltafunc_est);
    }
  }

  for (node_num = 0; node_num < num_terminalnodes; node_num++) {
    if (tree.has_node(node_num)) {
      if (denominator_bestconstant_[node_num] == 0) {
        tree.get_terminal_nodes()[node_num]->set_prediction(0.0);
      } else {
        tree.get_terminal_nodes()[node_num]->set_prediction(
            numerator_bestconstant_[node_num] /
            denominator_bestconstant_[node_num]);
      }
    }
  }
}
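Each terminal-node prediction is a single Newton-Raphson step on the in-bag exponential loss, started at zero; because \((2y_i-1)^2 = 1\), the Hessian reduces to the weighted sum of the per-observation losses:

\[
\hat c_k = \frac{\sum_{i \in k} w_i (2y_i-1)\, e^{-(2y_i-1)(f_i+o_i)}}{\sum_{i \in k} w_i\, e^{-(2y_i-1)(f_i+o_i)}}.
\]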
Example #14
void CNaiveBayes::Train(const CDataset &TrainSet)
{
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//if the range of a continuous attribute has changed (been extended), should we recalculate all existing statistics?
	//we can't: some information has been lost. We can only extend the first and the last intervals

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		//label of instance
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		//each attribute
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						//value of this attribute
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						int ValNo;

						if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min)
							ValNo=0;
						else
							ValNo=(int)((OrgData[i][j].Cont-Estims[j][Class].ContEst.Min)*10/
							(Estims[j][Class].ContEst.Max-Estims[j][Class].ContEst.Min));
						if(ValNo>=SplitNum)
							ValNo=SplitNum-1;
						if(ValNo<0)
							ValNo=0;
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}//case: attribute type
	}//for data

	//calculate all other statistics
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
		case ATT_DISCRETE:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
				for(int k=0;k<ValNum;k++)
					Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
			}
			break;
		case ATT_CONTINUOUS:
		case ATT_DATETIME:
			for(int j=0;j<OrgInfo.ClassNum;j++)
			{
				for(int k=0;k<SplitNum;k++)
					Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
			}
			break;
		default:
			break;
		}//switch
	}//for attributes

	//time consumed
	CreatingTime+=((double)(clock() - start) / CLOCKS_PER_SEC);
}
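Continuous and datetime attributes are histogram-binned per class, with the bin index clamped to [0, SplitNum-1]:

\[
\text{bin}(x) = \Bigl\lfloor \frac{(x - \min)\cdot 10}{\max - \min} \Bigr\rfloor.
\]

Note the hardcoded multiplier 10: values spread over all SplitNum bins only when SplitNum is 10.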
Example #15
std::auto_ptr<CDistribution> gbm_setup
(
 const CDataset& data,
 const std::string& family,
 int cTrees,
 int cDepth,
 int cMinObsInNode,
 int cNumClasses,
 double dShrinkage,
 double dBagFraction,
 int cTrain,
 int cFeatures,
 int& cGroups
 )
{
  std::auto_ptr<CDistribution> pDist;
  cGroups = -1;
  
  // set the distribution
  if (family == "gamma") {
    pDist.reset(new CGamma());
  } else if (family == "tweedie") {
    pDist.reset(new CTweedie(data.misc_ptr()[0]));
  } else if (family == "bernoulli") {
    pDist.reset(new CBernoulli());
  } else if (family == "gaussian") {
    pDist.reset(new CGaussian());
  } else if (family == "poisson") {
    pDist.reset(new CPoisson());
  } else if (family == "adaboost") {
    pDist.reset(new CAdaBoost());
  } else if (family == "coxph") {
    pDist.reset(new CCoxPH());
  } else if (family == "laplace") {
    pDist.reset(new CLaplace());
  } else if (family == "quantile") {
    pDist.reset(new CQuantile(data.misc_ptr()[0]));
  } else if (family == "tdist") {
    pDist.reset(new CTDist(data.misc_ptr()[0]));
  } else if (family == "multinomial") {
    pDist.reset(new CMultinomial(cNumClasses, data.nrow()));
  } else if (family == "huberized") {
    pDist.reset(new CHuberized());
  } else if (family == "pairwise_conc") {
    pDist.reset(new CPairwise("conc"));
  } else if (family == "pairwise_ndcg") {
    pDist.reset(new CPairwise("ndcg"));
  } else if (family == "pairwise_map") {
    pDist.reset(new CPairwise("map"));
  } else if (family == "pairwise_mrr") {
    pDist.reset(new CPairwise("mrr"));
  } else {
    throw GBM::invalid_argument();
  }

  if (0==family.compare(0, 8, "pairwise")) 
    {
      cGroups = num_groups(data.misc_ptr(), cTrain);
    }
  
  return pDist;
}
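gbm_setup is a string-keyed factory over the distribution hierarchy. A minimal, self-contained sketch of the same pattern, using a lookup table and std::unique_ptr in place of the deprecated std::auto_ptr (the Distribution types here are illustrative stand-ins, not the gbm classes):

#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct Distribution { virtual ~Distribution() = default; };
struct Gaussian : Distribution {};
struct Poisson : Distribution {};

std::unique_ptr<Distribution> MakeDistribution(const std::string& family) {
  // One factory lambda per family name; extend by adding entries.
  static const std::map<std::string,
                        std::function<std::unique_ptr<Distribution>()>>
      registry = {
          {"gaussian", [] { return std::make_unique<Gaussian>(); }},
          {"poisson", [] { return std::make_unique<Poisson>(); }},
      };
  const auto it = registry.find(family);
  if (it == registry.end())
    throw std::invalid_argument("unknown family: " + family);
  return it->second();
}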
Example #16
//CaseClassTab: a (CaseNum+1) x (EnsembleSize+1) table; cell (j,i) records whether classifier i predicts instance j correctly
int CPMEP::BuildCaseClassTab(vector<CaseClassArrayStr> &CaseClassTab,const CDataset &ValidatingSet,
	const vector<CPrediction*> &Predictions)
{
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	if(Predictions[0]->GetCaseNum()!=CaseNum)
	{
		printf("DataSet->height!=BpnnResult->CaseNum");
		return 1;
	}

	//construct and initialize the table
	CaseClassTab.clear();
	{
		//each instance occupies a row
		vector<CaseClassRecStr> CaseClassArray;
		//item: each classifier a column
		CaseClassRecStr CaseClassRec;
		CaseClassRec.Correct=0;
//		CaseClassRec.NodeLink=NULL;
		//total column: classifier number +1
		//last column: number of classifiers that predict this instance correctly
		for(int k=0;k<=EnsembleSize;k++)
		{
			CaseClassRec.Classifier=k;
			CaseClassArray.push_back(CaseClassRec);
		}

		//total row=CaseNum+1
		//Last row: number of instances predicted correctly by this classifier and id of it
		for(int j=0;j<=CaseNum;j++)
			CaseClassTab.push_back(CaseClassArray);
	}

	//fill it
	for(int i=0;i<EnsembleSize;i++)
	{
		//
		for(int j=0;j<CaseNum;j++)
		{
			//is this prediction correct?
			if(Predictions[i]->GetCorrectness()[j])
			{
				if(CaseClassTab[j][i].Correct!=0)
				{
					printf("CaseClassTab[j][i].Correct!=0");
					return 2;
				}
				CaseClassTab[j][i].Correct++;
				//last column: number of classifiers that predict this instance correctly
				CaseClassTab[j][EnsembleSize].Correct++;
				//last row: number of instances correctly predicted by this classifier
				CaseClassTab[CaseNum][i].Correct++;
			}
		}
	}

	//sort the columns of the last row in descending order of the corresponding classifiers' prediction accuracy
	sort(CaseClassTab[CaseNum].begin(),CaseClassTab[CaseNum].end(),CorrectDescOrder);
//	Dump("a.txt",CaseClassTab);
	//sort columns of other rows as the order of the last row
	for(int i=0;i<EnsembleSize;i++)
	{
		//do the remaining classifiers predict all instances incorrectly?
		if(CaseClassTab[CaseNum][i].Correct==0)break;
		//find each column's new position
		int k;
		for(k=i;k<EnsembleSize;k++)
			if(CaseClassTab[0][k].Classifier==CaseClassTab[CaseNum][i].Classifier)
				break;
		//don't need to change position?
		if(k==i)
			continue;

		//swap into new position (k -> i)
		CaseClassRecStr TempCaseClassRec;
		for(int j=0;j<CaseNum;j++)
		{
			TempCaseClassRec=CaseClassTab[j][i];
			CaseClassTab[j][i]=CaseClassTab[j][k];
			CaseClassTab[j][k]=TempCaseClassRec;
		}
	}
//	Dump("a.txt",CaseClassTab);

	return 0;
}
Example #17
CNaiveBayes::CNaiveBayes(const CDataset &TrainSet,int USplitNum)
{
	Name=MyName;
	SplitNum=USplitNum;
	//start time for training
	clock_t start=clock();

	//data
	const MATRIX &OrgData=TrainSet.GetData();
	const CASE_INFO &OrgInfo=TrainSet.GetInfo();

	//initialize all data structure
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		//each attribute
		EstimatorStr Estim;
		Estim.AttType=OrgInfo.ValidAttrs[i].AttType;
		if(Estim.AttType==ATT_DISCRETE)
		{
			//Laplace estimator
			Estim.DiscEst.Count=1;
			int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
			for(int j=0;j<ValNum;j++)
				Estim.DiscEst.AttrCount.push_back(1.0/ValNum);
		}
		//continuous attribute
		else
		{
			//Laplace estimator
			Estim.ContEst.Count=SplitNum;
			Estim.ContEst.Max=OrgInfo.ValidAttrs[i].Max;
			Estim.ContEst.Min=OrgInfo.ValidAttrs[i].Min;
			for(int j=0;j<SplitNum;j++)
				Estim.ContEst.Vals.push_back(1);
		}

		//for each attribute: all class label
		vector<EstimatorStr> EstiAttr;
		for(int j=0;j<OrgInfo.ClassNum;j++)
			EstiAttr.push_back(Estim);
		//all attributes
		Estims.push_back(EstiAttr);
	}

	//statistics
	for(int i=0;i<OrgInfo.Height;i++)
	{
		int Class=OrgData[i][OrgInfo.ValidWidth-1].Discr;
		for(int j=0;j<OrgInfo.ValidWidth-1;j++)
			switch(OrgInfo.ValidAttrs[j].AttType)
			{
				case ATT_DISCRETE:
					{
						int Val=OrgData[i][j].Discr;
						Estims[j][Class].DiscEst.Count++;
						//j: attribute, Class: label, Val: value of attribute
						Estims[j][Class].DiscEst.AttrCount[Val]++;
					}
					break;
				case ATT_CONTINUOUS:
				case ATT_DATETIME:
					{
						double Val=OrgData[i][j].Cont;
						int ValNo;

						if(OrgInfo.ValidAttrs[j].Max==OrgInfo.ValidAttrs[j].Min)
							ValNo=0;
						else
							ValNo=(int)((OrgData[i][j].Cont-OrgInfo.ValidAttrs[j].Min)*10/
								(OrgInfo.ValidAttrs[j].Max-OrgInfo.ValidAttrs[j].Min));
						if(ValNo>=SplitNum)
							ValNo=SplitNum-1;
						if(ValNo<0)
							ValNo=0;
						Estims[j][Class].ContEst.Vals[ValNo]++;
						Estims[j][Class].ContEst.Count++;
					}
					break;
				default:
					break;
			}
	}//for data

	//get all statistics needed
	for(int i=0;i<OrgInfo.ValidWidth-1;i++)
	{
		switch(OrgInfo.ValidAttrs[i].AttType)
		{
			case ATT_DISCRETE:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					int ValNum=(int)OrgInfo.ValidAttrs[i].Disc.size();
					for(int k=0;k<ValNum;k++)
						Estims[i][j].DiscEst.AttrCount[k]/=Estims[i][j].DiscEst.Count;
				}
				break;
			case ATT_CONTINUOUS:
			case ATT_DATETIME:
				for(int j=0;j<OrgInfo.ClassNum;j++)
				{
					for(int k=0;k<SplitNum;k++)
						Estims[i][j].ContEst.Vals[k]/=Estims[i][j].ContEst.Count;
				}
				break;
			default:
				break;
		}//switch
	}//for attr

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}
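The initial counts implement Laplace-style smoothing. For a discrete attribute with V distinct values, the final normalization yields

\[
\hat P(v \mid c) = \frac{n_{vc} + 1/V}{n_c + 1},
\]

and for the SplitNum histogram bins of a continuous attribute it is add-one smoothing,

\[
\hat P(k \mid c) = \frac{m_{kc} + 1}{m_c + \text{SplitNum}},
\]

where \(n_{vc}\) (resp. \(m_{kc}\)) counts the class-c training instances taking value v (falling in bin k).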
Example #18
CPMEP::CPMEP(const CEnsemble &UEnsemble,const CDataset &ValidatingSet,const vector<CPrediction*> &Predictions)
:CEnsemblePruner(UEnsemble)
{
	Name=MyName;
	if(Ensemble.GetSize()!=(int)Predictions.size())
		throw(CError("CPMEP: wrong size for user-defined weights!",100,0));
	//Info
	int CaseNum=ValidatingSet.GetInfo().Height;
	int EnsembleSize=Ensemble.GetSize();
	//start time for training
	clock_t start=clock();

	//table: instance-classifier-prediction
	vector<CaseClassArrayStr> CaseClassTab;
	BuildCaseClassTab(CaseClassTab,ValidatingSet,Predictions);
//	Dump("c.txt",CaseClassTab);
	//construct FP-tree
	TreeNodeStr Root;
	BuildFPTree(Root,CaseClassTab,EnsembleSize);
	
	vector<SelClassifierStr> SelClassifiers;
	//k: number of classifiers to be selected
	for(int k=1;k<=EnsembleSize/2*2+1;k+=2)
	{
		//path-table: paths with length of k/2+1
		vector<TreePathStr> TreePathTable;
		TreePathStr TreePath;
		BuildPathTab(TreePathTable,Root,TreePath,k/2+1);
//		Dump("cc.txt",TreePathTable);

		//selected classifier (no more than k)
		SelClassifierStr S,TempS;
		S.Count=0;
		//add paths until path-table is empty
		while((int)TreePathTable.size()>0 && (int)S.Set.size()<k)
		{
			//sort all paths by Count value and number of classifiers
			sort(TreePathTable.begin(),TreePathTable.end(),ClassNumOrder);
			stable_sort(TreePathTable.begin(),TreePathTable.end(),CountDescOrder);
//			Dump("TreePathTable.txt",TreePathTable);

			//temporarily select all classifiers of the first path
			TempS=S;
			TempS.Count+=TreePathTable[0].Count;
			for(int j=0;j<(int)TreePathTable[0].Classifiers.size();j++)
				TempS.Set.insert(TreePathTable[0].Classifiers[j]);

			//total size
			if((int)TempS.Set.size()<=k)
			{
				S=TempS;

				//remove classifiers of selected path from all rows of path-table
				for(int jj=0;jj<(int)TreePathTable[0].Classifiers.size();jj++)
				{
					for(int i=1;i<(int)TreePathTable.size();i++)
						for(int j=0;j<(int)TreePathTable[i].Classifiers.size();j++)
							if(TreePathTable[i].Classifiers[j]==TreePathTable[0].Classifiers[jj])
							{
								TreePathTable[i].Classifiers.erase(TreePathTable[i].Classifiers.begin()+j);
								break;
							}
				}
//				Dump("TreePathTable.txt",TreePathTable);

				//remove empty row from path-table
				for(int i=1;i<(int)TreePathTable.size();)
				{
					if(TreePathTable[i].Classifiers.size()<=0)
					{
						//the Count value of the path being removed is added
						TempS.Count+=TreePathTable[i].Count;

						TreePathTable.erase(TreePathTable.begin()+i);
						continue;
					}
					i++;
				}
				//this path is finished
				TreePathTable.erase(TreePathTable.begin());
//				Dump("TreePathTable.txt",TreePathTable);

				//merge same paths
				for(int i=0;i<(int)TreePathTable.size();i++)
				{
					set<int> A0;
					for(int jj=0;jj<(int)TreePathTable[i].Classifiers.size();jj++)
						A0.insert(TreePathTable[i].Classifiers[jj]);

					for(int j=i+1;j<(int)TreePathTable.size();)
					{
						set<int> A1;
						for(int jj=0;jj<(int)TreePathTable[j].Classifiers.size();jj++)
							A1.insert(TreePathTable[j].Classifiers[jj]);
						if(A0==A1)
						{
							TreePathTable[i].Count+=TreePathTable[j].Count;
							TreePathTable.erase(TreePathTable.begin()+j);
							continue;
						}
						j++;
					}
				}//for i
//				Dump("TreePathTable.txt",TreePathTable);
			}
			else//adding this path will make the size of selected classifiers greater than k, so skip it
				TreePathTable.erase(TreePathTable.begin());
		}//while
		SelClassifiers.push_back(S);

	}//for k
//	deltree(&&Root);

	//sort all sets by Count and size
	sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassSetSizeOrder);
	stable_sort(SelClassifiers.begin(),SelClassifiers.end(),SelClassCountDescOrder);

	//set the weight of selected classifiers
	for(int i=0;i<EnsembleSize;i++)
		Weights.push_back(0);
	set<int>::iterator	i_Classifier;
	for(i_Classifier=SelClassifiers[0].Set.begin();i_Classifier!=SelClassifiers[0].Set.end();i_Classifier++)
	{
		for(int i=0;i<EnsembleSize;i++)
			if(*i_Classifier==i)
				Weights[i]=1.0;
	}

	//time consumed
	CreatingTime = (double)(clock() - start) / CLOCKS_PER_SEC;
}