/*
 * Tallies how often each distinct value occurs in one column of a table.
 * Row 0 is never counted (the scan starts at row 1).
 *
 * The result holds one count per distinct value, in order of first
 * appearance, followed by one extra trailing element: the sum of all
 * the counts.
 */
vi countDistinct(vvs &table, int column)
{
    vs seenValues;  // distinct values, in order of first appearance
    vi tally;       // tally[k] is the number of rows holding seenValues[k]

    for (size_t row = 1; row < table.size(); ++row) {
        // Linear search for the slot that already tracks this row's value.
        size_t slot = 0;
        while (slot < seenValues.size() &&
               !(seenValues[slot] == table[row][column])) {
            ++slot;
        }

        if (slot == seenValues.size()) {
            // First time this value is seen: open a new slot for it.
            tally.push_back(1);
            seenValues.push_back(table[row][column]);
        } else {
            ++tally[slot];
        }
    }

    // Append the grand total as the final element.
    int total = 0;
    for (vi::const_iterator it = tally.begin(); it != tally.end(); ++it) {
        total += *it;
    }
    tally.push_back(total);

    return tally;
}
/*
 * Builds the sub-table obtained by fixing one attribute to one value.
 *
 * The result starts with a copy of the header row (row 0) with the column
 * named colName removed. It is followed by every data row (rows 1..n-1)
 * whose entry in that column equals value, again with that column removed.
 * Rows holding any other value are dropped.
 *
 * Edge cases:
 *  - An empty attributeTable yields an empty result.
 *  - If colName does not occur in the header, no row can match, so the
 *    result contains only the unmodified header row.
 *  - Rows too short to contain the column are skipped.
 */
vvs pruneTable(vvs &attributeTable, string &colName, string value)
{
    vvs prunedTable;
    if (attributeTable.empty()) {
        return prunedTable;
    }

    const vs &header = attributeTable[0];

    // Locate the column to drop; header.size() doubles as "not found".
    size_t column = header.size();
    for (size_t i = 0; i < header.size(); ++i) {
        if (header[i] == colName) {
            column = i;
            break;
        }
    }

    vs headerRow;
    for (size_t i = 0; i < header.size(); ++i) {
        if (i != column) {
            headerRow.push_back(header[i]);
        }
    }
    prunedTable.push_back(headerRow);

    if (column == header.size()) {
        return prunedTable;
    }

    // Start at row 1: the header must never be treated as a data row, even
    // when the requested value happens to equal the column's name.
    for (size_t r = 1; r < attributeTable.size(); ++r) {
        const vs &row = attributeTable[r];
        if (column >= row.size() || !(row[column] == value)) {
            continue;
        }
        vs keptCells;
        for (size_t c = 0; c < row.size(); ++c) {
            if (c != column) {
                keptCells.push_back(row[c]);
            }
        }
        prunedTable.push_back(keptCells);
    }
    return prunedTable;
}
/*
 * Summarises the values each column of the data table takes.
 *
 * The result has one entry per column of row 0. Entry c lists the distinct
 * cells found in column c, in order of first appearance, scanning from row 0
 * downwards. Row 0 is included in the scan, so each entry begins with the
 * cell from row 0 (the column name, if row 0 is a header).
 *
 * An empty dataTable yields an empty result; rows too short to contain a
 * given column are skipped for that column.
 */
vvs generateTableInfo(vvs &dataTable)
{
    if (dataTable.empty()) {
        return vvs();
    }

    const int numColumns = static_cast<int>(dataTable[0].size());

    // Pre-size the result so every loop iteration writes only its own slot;
    // the output order is then fixed by the index, not by execution order.
    vvs tableInfo(static_cast<size_t>(numColumns));

    #pragma omp parallel for schedule(static)
    for (int col = 0; col < numColumns; col++) {
        vs distinctValues;
        msi seen;  // occurrences so far of each cell value in this column
        for (size_t row = 0; row < dataTable.size(); row++) {
            if (static_cast<size_t>(col) >= dataTable[row].size()) {
                continue;
            }
            const string &cell = dataTable[row][col];
            // operator[] starts a new key at 0, so a zero pre-increment
            // count means this is the first sighting of the value.
            if (seen[cell]++ == 0) {
                distinctValues.push_back(cell);
            }
        }
        tableInfo[col].swap(distinctValues);
    }

    return tableInfo;
}
/*
 * Debugging aid: writes a table to standard output.
 * Each row goes on its own line, and every cell (including the last one
 * in a row) is followed by a tab character.
 */
void printAttributeTable(vvs &attributeTable)
{
    for (vvs::const_iterator row = attributeTable.begin();
         row != attributeTable.end(); ++row) {
        for (vs::const_iterator cell = row->begin(); cell != row->end(); ++cell) {
            cout << *cell << "\t";
        }
        cout << endl;
    }
}
/*
 * Looks up a column by name.
 * Scans tableInfo for the first entry whose element 0 equals columnName
 * and returns that entry's position, or -1 when no entry matches.
 * Every entry is expected to be non-empty, since element 0 is read
 * unconditionally.
 */
int returnColumnIndex(string &columnName, vvs &tableInfo)
{
    int position = 0;
    for (vvs::const_iterator entry = tableInfo.begin();
         entry != tableInfo.end(); ++entry, ++position) {
        if ((*entry)[0] == columnName) {
            return position;
        }
    }
    return -1;
}
/* * function: naive_bayes::set_data 设置数据集,产生辅助数据 * d: 数据集 * h: 属性, 需保证最后一列为目标属性,且为离散值 * b: 属性是离散值(false), 还是数值型(true) * */ bool naive_bayes::set_data(vvs& d, vs& h, vb b) { bool f = clear(); if(!f) return f; assert(d.size() > 0); // 数据集不能为空 datas = d; headers = h; num_attr = (int)headers.size(); num_data = (int)d.size(); is_numeric = b; is_numeric.resize(num_attr, false); assert(is_numeric.back() == false); // 目标属性必须为离散值 target_attr = headers.back(); attr_to_int.resize(num_attr); int_to_attr.resize(num_attr); attrs_size.resize(num_attr); for(int i = 0; i < num_data; ++i) { auto& e = d[i]; for(int j = 0; j < num_attr; ++j) { if(is_numeric[j]) continue; // 数值型数据不需要映射 auto it = attr_to_int[j].find(e[j]); if(it == attr_to_int[j].end()) { attr_to_int[j][e[j]] = (int)int_to_attr[j].size(); int_to_attr[j].push_back(e[j]); } } } for(int i = 0; i < num_attr; ++i) attrs_size[i] = (int)int_to_attr[i].size(); num_targ = attrs_size.back(); // 目标属性值的下标 for(int i = 0; i < num_data; ++i) target_to_label.push_back(attr_to_int[num_attr - 1][d[i][num_attr - 1]]); return true; }
// Tries to 2-colour the part of `graph` reachable from `node_number`.
// graph[v] lists the neighbours of node v. `color_number` is the colour
// given to the start node: 1 ("blue") or 2 ("red"). Any other value colours
// nothing, so the search never starts and the function returns true.
// Every node taken from a work set forces the opposite colour onto its
// neighbours. Returns false as soon as a neighbour already carries the same
// colour as the node being expanded, true if the search finishes without
// such a conflict.
bool check(vvs graph, size_t node_number, size_t color_number) {
    vector<size_t> colors(graph.size(), 0);  // 0 = not coloured yet
    set<size_t> blue;                        // pending nodes coloured 1
    set<size_t> red;                         // pending nodes coloured 2

    if (color_number == 1) {
        blue.insert(node_number);
        colors[node_number] = 1;
    } else if (color_number == 2) {
        red.insert(node_number);
        colors[node_number] = 2;
    }

    while (!blue.empty() || !red.empty()) {
        // Pending blue nodes are always drained before red ones; within a
        // set the smallest node index is taken first.
        const bool take_blue = !blue.empty();
        set<size_t>& current = take_blue ? blue : red;
        set<size_t>& opposite = take_blue ? red : blue;

        const size_t node = *current.begin();
        current.erase(current.begin());

        const size_t wanted = (colors[node] == 1) ? 2 : 1;

        for (size_t k = 0; k < graph[node].size(); ++k) {
            const size_t neighbour = graph[node][k];
            if (colors[neighbour] == 0) {
                colors[neighbour] = wanted;
                opposite.insert(neighbour);
            } else if (colors[neighbour] != wanted) {
                return false;
            }
        }
    }
    return true;
}
/*
 * Returns true if every data row of a subtable carries the same class
 * label, i.e. the node's class label has been decided.
 *
 * Row 0 is the header; the class label is the cell in the last header
 * column. A table with no data rows (or with an empty header) has no
 * conflicting labels and is reported as homogeneous.
 */
bool isHomogeneous(vvs &table)
{
    if (table.size() < 2 || table[0].empty()) {
        return true;
    }

    const size_t lastCol = table[0].size() - 1;
    const string &firstValue = table[1][lastCol];

    // Row 1 supplies the reference label, so comparison starts at row 2.
    for (size_t row = 2; row < table.size(); row++) {
        if (firstValue != table[row][lastCol]) {
            return false;
        }
    }
    return true;
}
// Debug helper: prints one case to standard output.
//   V: the variables; only their number is printed.
//   A: the rules; each rule is printed as its space-separated names.
//   f: the value printed after rule A[i] (three decimals). It is indexed with
//      the same index as A, so it must hold at least A.size() entries.
void printCase(vs V, vvs A, vd f){
    printf("Case Details:\n");
    printf("--------------\n");
    // V.size() is a size_t, so it needs %zu; %d mismatches the argument type.
    printf("Number Of Variables n: %zu\n\n", V.size());
    printf("Rules:\n");
    printf("-------\n");
    for(size_t i=0; i<A.size(); i++){
        for(size_t j=0; j<A[i].size(); j++){
            printf("%s ", A[i][j].c_str());
        }
        printf(":: %.3lf\n", f[i]);
    }
}
/* * Decides which column to split on * based on entropy. Returns the column * with the least entropy. */ string decideSplittingColumn(vvs &table) { int column, iii; double minEntropy = DBL_MAX; int splittingColumn = 0; vi entropies; #pragma omp parallel { #pragma omp for for (column = 0; column < table[0].size() - 1; column++) { string colName = table[0][column]; msi tempMap; vi counts = countDistinct(table, column); vd attributeEntropy; double columnEntropy = 0.0; for (iii = 1; iii < table.size() - 1; iii++) { double entropy = 0.0; if (tempMap.find(table[iii][column]) != tempMap.end()) { // IF ATTRIBUTE IS ALREADY FOUND IN A COLUMN, UPDATE IT'S FREQUENCY tempMap[table[iii][column]]++; } else { // IF ATTRIBUTE IS FOUND FOR THE FIRST TIME IN A COLUMN, THEN PROCESS IT AND CALCULATE IT'S ENTROPY tempMap[table[iii][column]] = 1; vvs tempTable = pruneTable(table, colName, table[iii][column]); vi classCounts = countDistinct(tempTable, tempTable[0].size() - 1); int jjj, kkk; for (jjj = 0; jjj < classCounts.size(); jjj++) { double temp = (double)classCounts[jjj]; entropy -= (temp / classCounts[classCounts.size() - 1])*(log(temp / classCounts[classCounts.size() - 1]) / log(2)); } attributeEntropy.push_back(entropy); entropy = 0.0; } } //for (iii = 0; iii < counts.size() - 1 && ((attributeEntropy.size() - 1) == (counts.size() - 1)); iii++) { for (iii = 0; iii < counts.size() - 1; iii++) { columnEntropy += ((double)counts[iii] * (double)attributeEntropy[iii]); } columnEntropy = columnEntropy / ((double)counts[counts.size() - 1]); if (columnEntropy <= minEntropy) { minEntropy = columnEntropy; splittingColumn = column; } } } return table[0][splittingColumn]; }
/*
 * Returns the most frequent class label in the training data; this class
 * is used as the default class label.
 *
 * Row 0 is the header and the class label is the cell in the last header
 * column of each data row. When several labels share the highest count, the
 * one visited last while iterating the frequency map wins (for an ordered
 * map, the greatest label). Returns an empty string when there are no data
 * rows or the header is empty.
 */
string returnMostFrequentClass(vvs &dataTable)
{
    if (dataTable.empty() || dataTable[0].empty()) {
        return string();
    }

    const size_t classColumn = dataTable[0].size() - 1;

    msi trainingClasses;  // class label -> frequency
    for (size_t row = 1; row < dataTable.size(); row++) {
        // operator[] starts a new label at 0, so one increment is enough.
        trainingClasses[dataTable[row][classColumn]]++;
    }

    int highestClassCount = 0;
    string mostFrequentClass;
    for (msi::const_iterator it = trainingClasses.begin();
         it != trainingClasses.end(); ++it) {
        if (it->second >= highestClassCount) {
            highestClassCount = it->second;
            mostFrequentClass = it->first;
        }
    }
    return mostFrequentClass;
}
// True when every element of `needles` also occurs somewhere in `haystack`.
static bool recurContainsAll(const vs &needles, const vs &haystack){
    for(size_t k=0; k<needles.size(); k++){
        bool found=false;
        for(size_t l=0; l<haystack.size(); l++){
            if(needles[k]==haystack[l]){ found=true; break; }
        }
        if(!found) return false;
    }
    return true;
}

// Recursively eliminates the variables in V, one per call, and returns the
// best product found.
//   V: variable names, must be non-empty; V[0] is eliminated by this call.
//   A: rules, each a list of variable names.
//   f: f[i] is the value attached to rule A[i].
// With a single variable left the answer is f[0]. Otherwise:
//   1. val starts at the value (truncated to int) of the first one-element
//      rule naming V[0]; if there is no such rule val is 0 and the result
//      is 0.
//   2. V[0] is removed from every rule; empty rules are dropped and the
//      rest are de-duplicated (which also sorts them).
//   3. A graph links each reduced rule to every reduced rule that strictly
//      contains it.
//   4. For val, val-1, ..., 1: each reduced rule S gets the smaller of the
//      value of the original rule equal to S and the value of the original
//      rule equal to S+V[0] divided by val. Each rule is then lowered to
//      the minimum over the rules containing it, and the candidate result
//      is val * recur(remaining variables, reduced rules, new values).
// The maximum candidate is returned.
double recur(vs V, vvs A, vd f){
    //printCase(V, A, f);
    if(V.size()==1) return f[0];

    const string var = V[0];

    // Stays 0 when no one-element rule names `var`, so the loop below is
    // skipped instead of running on an indeterminate value.
    int val = 0;
    for(size_t i=0; i<A.size(); i++){
        if(A[i].size()!=1) continue;
        if(var==A[i][0]) { val = static_cast<int>(f[i]); break; }
    }

    //step 1: the remaining variables
    vs Vij(V.begin()+1, V.end());

    //step 2: strip `var` from every rule, dropping rules that become empty
    // and collapsing repeated rules
    set<vs> uniqueRules;
    for(size_t i=0; i<A.size(); i++){
        vs reduced;
        for(size_t j=0; j<A[i].size(); j++){
            if(var!=A[i][j]) reduced.push_back(A[i][j]);
        }
        if(!reduced.empty()) uniqueRules.insert(reduced);
    }
    vvs Aij(uniqueRules.begin(), uniqueRules.end());

    // construct DAG: edge i -> j when rule i is strictly contained in rule j;
    // graphT holds the reversed edges
    vector<vi> graph(Aij.size(), vi());
    vector<vi> graphT(Aij.size(), vi());
    for(size_t i=0; i<Aij.size(); i++){
        for(size_t j=0; j<Aij.size(); j++){
            if(i==j||Aij[i].size()>=Aij[j].size()) continue;
            if(recurContainsAll(Aij[i], Aij[j])){
                graph[i].push_back((int)j);
                graphT[j].push_back((int)i);
            }
        }
    }

    // Try for all values of the number of instances of the cur variable
    double maxValue=0;
    while(val>0){
        vd fij(Aij.size(), INF);
        for(size_t i=0; i<Aij.size(); i++){
            const vs &S = Aij[i];
            vs Sv = S;
            Sv.push_back(var);

            bool foundS=false, foundSv=false;
            double foundSval=0, foundSvVal=0;
            // No early exit: when several original rules match, the last
            // one supplies the value.
            for(size_t j=0; j<A.size(); j++){
                if(S.size()==A[j].size() && recurContainsAll(S, A[j])){
                    foundS=true;
                    foundSval=f[j];
                }
                if(Sv.size()==A[j].size() && recurContainsAll(Sv, A[j])){
                    foundSv=true;
                    foundSvVal=f[j]/val;
                }
            }

            if(foundS && foundSv) fij[i]=min(foundSval,foundSvVal);
            else if(foundS)       fij[i]=foundSval;
            else if(foundSv)      fij[i]=foundSvVal;
            // otherwise fij[i] keeps its INF initial value
        }

        // Start from the rules contained in no other rule and walk the
        // reversed edges, lowering each rule to the minimum over the rules
        // that contain it.
        queue<int> q;
        for(size_t i=0; i<Aij.size(); i++){
            if(graph[i].size()==0) q.push((int)i);
        }
        while(!q.empty()){
            int node = q.front();
            q.pop();
            for(size_t i=0; i<graph[node].size(); i++){
                fij[node]=min(fij[node], fij[graph[node][i]]);
            }
            for(size_t i=0; i<graphT[node].size(); i++){
                q.push(graphT[node][i]);
            }
        }

        maxValue=max(maxValue, val*recur(Vij, Aij, fij));
        val--;
    }
    return maxValue;
}
/*
 * Returns true if the table holds no data rows, false otherwise.
 * A table consisting of a single row (the header) is empty, and so is a
 * table with no rows at all.
 */
bool tableIsEmpty(vvs &table)
{
    return table.size() <= 1;
}