Exemplo n.º 1
0
/**
 * Recursively grows the tree below `pool`.
 *
 * The per-level index counts of `pool` are fed to entropy() together with
 * pool.sampleSize(); that value is then passed to infoGain() for every entry
 * of matComp. The entry with the largest result is selected (on ties the
 * later index wins, because the comparison is >=). When the best result is
 * strictly positive, the choice is recorded on `pool`, one child Pool is added
 * per level of the chosen entry, and each child whose levelSize() exceeds 1 is
 * processed recursively.
 *
 * NOTE(review): std::set_intersection requires both index ranges to be sorted;
 * presumably idxs() and allIdxs() return sorted indices - confirm.
 */
void DecisionTree::computeTree(Pool& pool)
{
    // Number of samples of this pool falling in each response level.
    std::vector<int> levelCounts(pool.levelSize());
    for(int lvl=0;lvl<pool.levelSize();lvl++)
    {
        levelCounts[lvl] = pool.idxs(lvl).size();
    }

    double baseTerm = entropy(levelCounts,pool.sampleSize());

    // Scan every candidate variable and remember the best information gain.
    double bestGain = 0;
    int bestVar = 0;
    for(int v=0;v<matComp.size();v++)
    {
        double gain = infoGain(matComp[v],pool,baseTerm);
        if(!(gain>=bestGain))
            continue;
        bestGain = gain;
        bestVar = v;
    }

    // Nothing to gain from any variable: this pool stays as it is.
    if(!(bestGain>0))
        return;

    pool.setVarIdx(bestVar);
    pool.setInfoGain(bestGain);

    int nLevels = matComp[bestVar].levelSize();
    for(int lvl=0;lvl<nLevels;lvl++)
    {
        // Samples of this pool that take level `lvl` of the chosen variable.
        std::vector<int> varIdxs = matComp[bestVar].idxs(lvl);
        std::vector<int> poolIdxs = pool.allIdxs();
        std::vector<int> common;
        common.reserve(pool.sampleSize());
        std::set_intersection(varIdxs.begin(),varIdxs.end(),
                              poolIdxs.begin(),poolIdxs.end(),
                              std::back_inserter(common));

        pool.add_node(Pool(common,pool.labsFromIdx(common),matComp[bestVar].getLevel(lvl)));

        // Only children that still have more than one level are split further.
        Pool& child = pool.getNext(lvl);
        if(child.levelSize()>1)
            computeTree(child);
    }
}
Exemplo n.º 2
0
// Gini term added to tree_gini for one side of a node that is not split further:
//   cnt * 2 * (cnt - ones + ls1) * (ones + ls1) / (cnt + ls2)^2
// Evaluated entirely in double: the same product in int overflows once the
// weighted counts reach a few thousand. The caller guarantees cnt > 0.
static double leaf_gini_term(int cnt, int ones, double ls1, double ls2) {
   const double c = (double)cnt;
   const double o = (double)ones;
   return (c * 2.0 * (c - o + ls1) * (o + ls1)) / ((c + ls2) * (c + ls2));
}

// Recursively fills pNode (and its children) from the weighted examples in idx_orig.
//
// A list of features is drawn with create_feature_list(); for each one a cut value
// is chosen and infoGain() is evaluated on a 2x2 table of weighted counts
// (side of the cut x label). The feature with the largest gain is stored in the
// node, the examples are partitioned into "> cutVal" (branch 1) and the rest
// (branch 0), and each branch that has more than one entry and contains both
// labels is expanded recursively. Every other non-empty branch is counted in
// node_cnt and contributes a Gini term to tree_gini.
//
// pNode      node to fill; node0/node1 are reset to NULL, then ig, attr_id, cutVal,
//            a_l_cnt, class_a0, class_a1 and exCnt are written.
// idx_orig   (index, weight) pairs; .first indexes pXD_->trn_attr / pXD_->trn_labl,
//            .second is added to the counts.
// pre        prefix string, only handed down to the recursive calls here.
// depth      current depth; scales the number of sampled features.
// node_cnt   incremented once per non-empty branch that is not expanded.
// tree_gini  accumulator for the Gini terms of those branches.
//
// NOTE(review): labels are cast to int and used to index a size-2 vector, so they
// are presumably 0/1 - confirm against the data loader.
void RandomForest::recAssignNode(Node* pNode, vector<pair<int,int> >& idx_orig, string pre, int depth, int& node_cnt, double& tree_gini) {
   double best_ig=-1;
   int best_attr=-1;
   bool rand_best_f = false;
   double best_cutVal=0.5;
   vector<vector<int> > best_a_l_cnt(2,vector<int>(2,0));
   vector<pair<int,int> > idx0, idx1;   // examples routed to branch 0 / branch 1
   pNode->node0 = NULL;
   pNode->node1 = NULL;

   vector<int> features;
   int feature_sample_cnt = (int)((double)base_feature_sample_cnt_ * pow(progressive_factor_,depth));
   if(feature_sample_cnt > good_features_.size()) feature_sample_cnt = good_features_.size();
   create_feature_list(features, feature_sample_cnt, good_features_);
   if(debug && 0) {
      int last=-1;
      for(int i=0; i<features.size(); i++) {
         if(features[i] == last) cerr << endl << "ERROR: repeat feature" << endl;
         last = features[i];
         cerr << features[i] << ",";
      }
      cerr << endl;
   }
   vector<vector<int> > a_l_cnt(2,vector<int>(2,0));
   double cutVal = 0.5;   // initialised so it is never read indeterminate
   double ig=-1;
   int approach = 1;
   vector<f_ig_cut> vBestF;
   // Candidate cuts are only evaluated every step_size-th sorted example.
   int step_size = (int)(0.01*(double)idx_orig.size());
   if(step_size < 1) step_size = 1;
   for(int attr_idx = 0; attr_idx < feature_sample_cnt; attr_idx++) {
      if(pXD_->attr_class[features[attr_idx]] == 1 and approach == 1) {
         // Sort the examples by attribute value and sweep the cut point upwards,
         // moving weight from side 1 to side 0 as examples are passed.
         double attr_best_ig = -1;   // best gain for this attribute only
         vector<pair<int,double> > attr_sort_idx(idx_orig.size(),pair<int,double>(0,0.0));
         a_l_cnt.assign(2,vector<int>(2,0));
         for(int exi = 0; exi < idx_orig.size(); exi++) {
            attr_sort_idx[exi] = pair<int,double>(exi,pXD_->trn_attr[idx_orig[exi].first][features[attr_idx]]);
            a_l_cnt[1][(int)pXD_->trn_labl[idx_orig[exi].first]] += idx_orig[exi].second;
         }
         sort(attr_sort_idx.begin(),attr_sort_idx.end(),idx_double_sort);
         // Signed bound: size()-step_size in unsigned arithmetic wraps around
         // when there are fewer examples than step_size (e.g. an empty idx_orig).
         const int scan_end = (int)attr_sort_idx.size() - step_size;
         for(int exii = 0; exii < scan_end; exii++) {
            int ex_id = idx_orig[attr_sort_idx[exii].first].first;
            int ex_id_next = idx_orig[attr_sort_idx[exii+step_size].first].first;
            int ex_cnt = idx_orig[attr_sort_idx[exii].first].second;
            int labl = (int)pXD_->trn_labl[ex_id];
            double attr_val = pXD_->trn_attr[ex_id][features[attr_idx]];
            a_l_cnt[0][labl] += ex_cnt;
            a_l_cnt[1][labl] -= ex_cnt;
            // Skip positions off the step grid, and cuts that would not separate
            // this example from the one step_size ahead.
            if (pXD_->trn_attr[ex_id_next][features[attr_idx]] == attr_val || (exii % step_size) != 0) continue;
            ig = infoGain(a_l_cnt);
            if(ig>attr_best_ig) {
               attr_best_ig = ig;
               cutVal = attr_val;  //be sure to cut at <= cutVal or > cutVal
            }
         }
         ig = attr_best_ig;
      } else {
         a_l_cnt.assign(2,vector<int>(2,0));
         if(approach == 1 || approach == 2) {
            cutVal = pXD_->attr_avg[features[attr_idx]];
         } else {
            cutVal = (double)((rand() % 1000)+1)/(double)(1001);
         }
         for(int exi = 0; exi < idx_orig.size(); exi++) {
            int attr_split = (pXD_->trn_attr[idx_orig[exi].first][features[attr_idx]] > cutVal) ? 1 : 0;
            a_l_cnt[attr_split][(int)pXD_->trn_labl[idx_orig[exi].first]] += idx_orig[exi].second;
         }
         ig = infoGain(a_l_cnt);
      }

      if(rand_best_f) {
         f_ig_cut newcut;
         newcut.ig = ig;
         newcut.attr = features[attr_idx];
         newcut.cutVal = cutVal;
         newcut.a_l_cnt = a_l_cnt;
         vBestF.push_back(newcut);
      }

      if(ig > best_ig) {
         best_ig = ig;
         best_attr = features[attr_idx];
         best_cutVal = cutVal;
         best_a_l_cnt = a_l_cnt;
      }
   }
   if(rand_best_f and !vBestF.empty()) {
      sort(vBestF.begin(), vBestF.end(), f_ig_cut_sort);
      int id = (int)(pow((double)(rand() % 1000)/1000.0,11) * 3 /*vBestF.size()*/);
      // id can reach 2; clamp so short candidate lists are not indexed past the end.
      if(id >= (int)vBestF.size()) id = (int)vBestF.size() - 1;
      best_ig = vBestF[id].ig;
      best_attr = vBestF[id].attr;
      best_cutVal = vBestF[id].cutVal;
      best_a_l_cnt = vBestF[id].a_l_cnt;
   }

   pNode->ig = best_ig;
   pNode->attr_id = best_attr;
   pNode->cutVal = best_cutVal;
   pNode->a_l_cnt = best_a_l_cnt;

   // best_attr is still -1 when no sampled feature produced a gain above -1
   // (or no feature was sampled). Indexing trn_attr with -1 would read out of
   // bounds, so in that case every example is routed to branch 0.
   // NOTE(review): attr_id == -1 is still stored on the node; confirm that the
   // prediction code tolerates it.
   const bool has_split = (best_attr >= 0);
   int cnt_a1=0, cnt_a0=0, onesCnt_a1=0, onesCnt_a0=0;
   for(int exi = 0; exi < idx_orig.size(); exi++) {
      if(has_split and pXD_->trn_attr[idx_orig[exi].first][best_attr] > best_cutVal) {
         idx1.push_back(idx_orig[exi]);
         onesCnt_a1 += (pXD_->trn_labl[idx_orig[exi].first] > 0.5) ? idx_orig[exi].second : 0;
         cnt_a1 += idx_orig[exi].second;
      } else {
         idx0.push_back(idx_orig[exi]);
         onesCnt_a0 += (pXD_->trn_labl[idx_orig[exi].first] > 0.5) ? idx_orig[exi].second : 0;
         cnt_a0 += idx_orig[exi].second;
      }
   }
   //double ls1=1, ls2=2; // laplace smoothing to avoid infinite values
   double ls1=0, ls2=0; // laplace smoothing to avoid infinite values
   if(cnt_a0 > 0) pNode->class_a0 = ((double)onesCnt_a0 + ls1)/((double)cnt_a0 + ls2);
   else pNode->class_a0 = 0.5;
   if(cnt_a1 > 0) pNode->class_a1 = ((double)onesCnt_a1 + ls1)/((double)cnt_a1 + ls2);
   else pNode->class_a1 = 0.5;
   pNode->exCnt = cnt_a0 + cnt_a1;

   // Expand further when the split gained something, or when deeper levels will
   // sample more features than this one did.
   if(best_ig > 0 or (feature_sample_cnt < good_features_.size() and progressive_factor_ > 1)) {
      if(idx0.size() > 1 and onesCnt_a0 != 0 and onesCnt_a0 != cnt_a0) {
         pNode->node0 = new Node;
         recAssignNode(pNode->node0, idx0, pre + "  ", depth+1, node_cnt, tree_gini);
      } else if(cnt_a0 > 0) {
         node_cnt++;
         tree_gini += leaf_gini_term(cnt_a0, onesCnt_a0, ls1, ls2);
      }
      if(idx1.size() > 1 and onesCnt_a1 != 0 and onesCnt_a1 != cnt_a1) {
         pNode->node1 = new Node;
         recAssignNode(pNode->node1, idx1, pre + "  ", depth+1, node_cnt, tree_gini);
      } else if(cnt_a1 > 0) {
         node_cnt++;
         tree_gini += leaf_gini_term(cnt_a1, onesCnt_a1, ls1, ls2);
      }
   } else {
      if(cnt_a0 > 0) {
         node_cnt += 1;
         tree_gini += leaf_gini_term(cnt_a0, onesCnt_a0, ls1, ls2);
      }
      if(cnt_a1 > 0) {
         node_cnt += 1;
         tree_gini += leaf_gini_term(cnt_a1, onesCnt_a1, ls1, ls2);
      }
   }
}