/**
 * Recursively grows the tree rooted at `pool`.
 *
 * The candidate in `matComp` with the largest information gain (ties go to
 * the later candidate) is recorded on `pool`; one child Pool is then added
 * per level of that candidate, holding the intersection of the candidate's
 * row indices for that level with the rows of `pool`. Children that still
 * contain more than one response level are expanded recursively. Nothing is
 * added when no candidate has a strictly positive gain.
 *
 * NOTE(review): std::set_intersection requires both index lists to be
 * sorted ascending -- confirm that idxs() and allIdxs() guarantee this.
 *
 * @param pool node to split; updated in place (chosen variable, gain, children).
 */
void DecisionTree::computeTree(Pool& pool)
{
    // Number of samples falling into each response level at this node.
    std::vector<int> levelCounts;
    levelCounts.reserve(pool.levelSize());
    for (int level = 0; level < pool.levelSize(); ++level)
        levelCounts.push_back(pool.idxs(level).size());

    const double baseEntropy = entropy(levelCounts, pool.sampleSize());

    // Pick the candidate with the highest gain; `>=` lets later candidates win ties.
    double bestGain = 0;
    int bestVar = 0;
    for (int var = 0; var < matComp.size(); ++var)
    {
        const double gain = infoGain(matComp[var], pool, baseEntropy);
        if (!(gain >= bestGain))
            continue;
        bestGain = gain;
        bestVar = var;
    }

    // No informative split: leave this node without children.
    if (!(bestGain > 0))
        return;

    pool.setVarIdx(bestVar);
    pool.setInfoGain(bestGain);

    const int branchCount = matComp[bestVar].levelSize();
    for (int branch = 0; branch < branchCount; ++branch)
    {
        // Rows of this node that take the `branch`-th level of the chosen variable.
        std::vector<int> varRows = matComp[bestVar].idxs(branch);
        std::vector<int> nodeRows = pool.allIdxs();
        std::vector<int> shared;
        shared.reserve(pool.sampleSize());
        std::set_intersection(varRows.begin(), varRows.end(),
                              nodeRows.begin(), nodeRows.end(),
                              std::back_inserter(shared));

        pool.add_node(Pool(shared, pool.labsFromIdx(shared), matComp[bestVar].getLevel(branch)));

        // Only children that still mix several response levels need further splitting.
        Pool& child = pool.getNext(branch);
        if (child.levelSize() > 1)
            computeTree(child);
    }
}
void RandomForest::recAssignNode(Node* pNode, vector<pair<int,int> >& idx_orig, string pre, int depth, int& node_cnt, double& tree_gini) { double best_ig=-1; int best_attr=-1; bool rand_best_f = false; double best_cutVal=0.5; vector<vector<int> > best_a_l_cnt(2,vector<int>(2,0)); vector<pair<int,int> > idx0, idx1; idx0.empty(); idx1.empty(); pNode->node0 = NULL; pNode->node1 = NULL; vector<int> features; int feature_sample_cnt = (int)((double)base_feature_sample_cnt_ * pow(progressive_factor_,depth)); if(feature_sample_cnt > good_features_.size()) feature_sample_cnt = good_features_.size(); create_feature_list(features, feature_sample_cnt, good_features_); if(debug && 0) { int last=-1; for(int i=0; i<features.size(); i++) { if(features[i] == last) cerr << endl << "ERROR: repeat feature" << endl; last = features[i]; cerr << features[i] << ","; } cerr << endl; } vector<vector<int> > a_l_cnt(2,vector<int>(2,0)); double cutVal; double ig=-1; int approach = 1; vector<f_ig_cut> vBestF; int step_size = (int)(0.01*(double)idx_orig.size()); if(step_size < 1) step_size = 1; for(int attr_idx = 0; attr_idx < feature_sample_cnt; attr_idx++) { //double cutVal = (double)((rand() % 1000)+1)/(double)(1001); //cerr << features[attr_idx] << ":" << pXD_->attr_class[features[attr_idx]] << ":" << pXD_->attr_class.size() << endl; if(pXD_->attr_class[features[attr_idx]] == 1 and approach == 1) { double best_ig = -1; vector<pair<int,double> > attr_sort_idx(idx_orig.size(),pair<int,double>(0,0.0)); a_l_cnt.assign(2,vector<int>(2,0)); for(int exi = 0; exi < idx_orig.size(); exi++) { attr_sort_idx[exi] = pair<int,double>(exi,pXD_->trn_attr[idx_orig[exi].first][features[attr_idx]]); a_l_cnt[1][(int)pXD_->trn_labl[idx_orig[exi].first]] += idx_orig[exi].second; } sort(attr_sort_idx.begin(),attr_sort_idx.end(),idx_double_sort); int ex_id, ex_id_next, ex_cnt, labl; double attr_val; for(int exii = 0; exii < (attr_sort_idx.size()-step_size); exii++) { ex_id = 
idx_orig[attr_sort_idx[exii].first].first; ex_id_next = idx_orig[attr_sort_idx[exii+step_size].first].first; ex_cnt = idx_orig[attr_sort_idx[exii].first].second; labl = (int)pXD_->trn_labl[ex_id]; attr_val = pXD_->trn_attr[ex_id][features[attr_idx]]; a_l_cnt[0][labl] += ex_cnt; a_l_cnt[1][labl] -= ex_cnt; if (pXD_->trn_attr[ex_id_next][features[attr_idx]] == attr_val || (exii % step_size) != 0) continue; ig = infoGain(a_l_cnt); if(ig>best_ig) { best_ig = ig; cutVal = attr_val; //be sure to cut at <= cutVal or > cutVal } } ig = best_ig; } else { a_l_cnt.assign(2,vector<int>(2,0)); if(approach == 1 || approach == 2) { cutVal = pXD_->attr_avg[features[attr_idx]]; } else { cutVal = (double)((rand() % 1000)+1)/(double)(1001); } int attr_split; for(int exi = 0; exi < idx_orig.size(); exi++) { attr_split = (pXD_->trn_attr[idx_orig[exi].first][features[attr_idx]] > cutVal) ? 1 : 0; a_l_cnt[attr_split][(int)pXD_->trn_labl[idx_orig[exi].first]] += idx_orig[exi].second; } ig = infoGain(a_l_cnt); } if(rand_best_f) { f_ig_cut newcut; newcut.ig = ig; newcut.attr = features[attr_idx]; newcut.cutVal = cutVal; newcut.a_l_cnt = a_l_cnt; vBestF.push_back(newcut); } if(ig > best_ig) { best_ig = ig; best_attr = features[attr_idx]; best_cutVal = cutVal; best_a_l_cnt = a_l_cnt; } //cerr << features[attr_idx] << "," << pXD_->attr_class[features[attr_idx]] << "," << ig << endl; } if(rand_best_f) { sort(vBestF.begin(), vBestF.end(), f_ig_cut_sort); int id = (int)(pow((double)(rand() % 1000)/1000.0,11) * 3 /*vBestF.size()*/); //cerr << id << endl; //cerr << vBestF[0].ig << "," << best_ig << endl; best_ig = vBestF[id].ig; best_attr = vBestF[id].attr; best_cutVal = vBestF[id].cutVal; best_a_l_cnt = vBestF[id].a_l_cnt; } pNode->ig = best_ig; pNode->attr_id = best_attr; pNode->cutVal = best_cutVal; pNode->a_l_cnt = best_a_l_cnt; int cnt_a1=0, cnt_a0=0, onesCnt_a1=0, onesCnt_a0=0; for(int exi = 0; exi < idx_orig.size(); exi++) { if(pXD_->trn_attr[idx_orig[exi].first][best_attr] > best_cutVal) { 
idx1.push_back(idx_orig[exi]); onesCnt_a1 += (pXD_->trn_labl[idx_orig[exi].first] > 0.5) ? idx_orig[exi].second : 0; cnt_a1 += idx_orig[exi].second; } else { idx0.push_back(idx_orig[exi]); onesCnt_a0 += (pXD_->trn_labl[idx_orig[exi].first] > 0.5) ? idx_orig[exi].second : 0; cnt_a0 += idx_orig[exi].second; } } //double ls1=1, ls2=2; // laplace smoothing to avoid infinite values double ls1=0, ls2=0; // laplace smoothing to avoid infinite values if(cnt_a0 > 0) pNode->class_a0 = ((double)onesCnt_a0 + ls1)/((double)cnt_a0 + ls2); else pNode->class_a0 = 0.5; if(cnt_a1 > 0) pNode->class_a1 = ((double)onesCnt_a1 + ls1)/((double)cnt_a1 + ls2); else pNode->class_a1 = 0.5; pNode->exCnt = cnt_a0 + cnt_a1; /* cerr << pre << idx_orig.size() << "," << best_attr << "(" << pXD_->attr_class[best_attr] << ")," << best_ig << "," << idx0.size() << "," << pNode->class_a0 << "," << idx1.size() << "," << pNode->class_a1 << endl; */ { int ls1=0, ls2=0; if(best_ig > 0 or (feature_sample_cnt < good_features_.size() and progressive_factor_ > 1)) { if(idx0.size() > 1 and onesCnt_a0 != 0 and onesCnt_a0 != cnt_a0) { pNode->node0 = new Node; recAssignNode(pNode->node0, idx0, pre + " ", depth+1, node_cnt, tree_gini); } else { if(cnt_a0 > 0) { node_cnt++; tree_gini += (double)(cnt_a0*2*(cnt_a0-onesCnt_a0+ls1)*(onesCnt_a0+ls1)) / (double)((cnt_a0 + ls2)*(cnt_a0 + ls2)); } } if(idx1.size() > 1 and onesCnt_a1 != 0 and onesCnt_a1 != cnt_a1) { pNode->node1 = new Node; recAssignNode(pNode->node1, idx1, pre + " ", depth+1, node_cnt, tree_gini); } else { if(cnt_a1 > 0) { node_cnt++; tree_gini += (double)(cnt_a1*2*(cnt_a1-onesCnt_a1+ls1)*(onesCnt_a1+ls1)) / (double)((cnt_a1 + ls2)*(cnt_a1+ls2)); } } } else { if(cnt_a0 > 0) { node_cnt += 1; tree_gini += (double)(cnt_a0*2*(cnt_a0-onesCnt_a0+ls1)*(onesCnt_a0+ls1)) / (double)((cnt_a0 + ls2)*(cnt_a0+ls2)); } if(cnt_a1 > 0) { node_cnt += 1; tree_gini += (double)(cnt_a1*2*(cnt_a1-onesCnt_a1+ls1)*(onesCnt_a1+ls1)) / (double)((cnt_a1 + ls2)*(cnt_a1+ls2)); } } } }