示例#1
0
/*
* Builds the sub-table obtained by fixing one attribute to one value.
*
* The result starts with the header row of attributeTable minus the
* column named colName, followed by every data row (rows 1..n-1) whose
* cell in that column equals value, again with that column removed.
* Rows that do not carry the value are dropped.
*
* Edge cases:
*  - an empty attributeTable yields an empty result;
*  - if colName is not present in the header, nothing can match, so only
*    the (unchanged) header row is returned;
*  - data rows too short to contain the column are skipped.
*
* Neither attributeTable nor colName is modified.
*/
vvs pruneTable(vvs &attributeTable, string &colName, string value)
{
	vvs prunedTable;
	if (attributeTable.empty()) {
		return prunedTable;
	}

	// Find the first header cell equal to colName; header.size() means "not found".
	const vs &header = attributeTable[0];
	size_t column = header.size();
	for (size_t iii = 0; iii < header.size(); iii++) {
		if (header[iii] == colName) {
			column = iii;
			break;
		}
	}
	const bool columnFound = (column < header.size());

	// Header of the result: every column except the one being fixed.
	vs headerRow;
	for (size_t iii = 0; iii < header.size(); iii++) {
		if (iii != column) {
			headerRow.push_back(header[iii]);
		}
	}
	prunedTable.push_back(headerRow);
	if (!columnFound) {
		return prunedTable;
	}

	// Start at row 1: row 0 is the header and must never be treated as data,
	// even when value happens to equal the column name.
	for (size_t iii = 1; iii < attributeTable.size(); iii++) {
		const vs &row = attributeTable[iii];
		if (column >= row.size() || row[column] != value) {
			continue;
		}
		vs auxRow;
		for (size_t jjj = 0; jjj < row.size(); jjj++) {
			if (jjj != column) {
				auxRow.push_back(row[jjj]);
			}
		}
		prunedTable.push_back(auxRow);
	}
	return prunedTable;
}
示例#2
0
/*
* Returns a vvs describing the data table: one entry per column of the
* first row, holding the distinct cell values of that column in order
* of first appearance. Row 0 is scanned like any other row, so when the
* table starts with a header the column name is the first element of
* each entry.
*
* An empty dataTable yields an empty result. Rows that are too short to
* contain a given column are ignored for that column.
*
* The column loop carries OpenMP pragmas; the ordered clause makes the
* push_backs into the result happen in column order.
*/
vvs generateTableInfo(vvs &dataTable)
{
	vvs tableInfo;
	if (dataTable.empty()) {
		return tableInfo;
	}
	// Hoisted so the parallel loop has plain, loop-invariant int bounds.
	const int columnCount = static_cast<int>(dataTable[0].size());
	const int rowCount = static_cast<int>(dataTable.size());

#pragma omp parallel 
	{
#pragma omp for schedule(static) ordered
		for (int iii = 0; iii < columnCount; iii++) {
			vs tempInfo;
			msi seenValues;	// used purely as a "have we met this value" set
			for (int jjj = 0; jjj < rowCount; jjj++) {
				const vs &row = dataTable[jjj];
				if (static_cast<size_t>(iii) >= row.size()) {
					continue;
				}
				// insert() reports whether the key was new
				if (seenValues.insert(msi::value_type(row[iii], 1)).second) {
					tempInfo.push_back(row[iii]);
				}
			}
#pragma omp ordered
			tableInfo.push_back(tempInfo);
		}
	}
	return tableInfo;
}
示例#3
0
/*
* Tallies how often each distinct value occurs in one column of the
* table, skipping row 0. The returned vector holds one count per
* distinct value, in order of first appearance, followed by one extra
* trailing element: the sum of all those counts.
*/
vi countDistinct(vvs &table, int column)
{
	vs seenValues;		// distinct values met so far
	vi tally;		// tally[k] counts occurrences of seenValues[k]

	for (size_t row = 1; row < table.size(); ++row) {
		const string &cell = table[row][column];

		// Linear search for the slot that already represents this value.
		size_t slot = 0;
		while (slot < seenValues.size() && seenValues[slot] != cell) {
			++slot;
		}

		if (slot == seenValues.size()) {
			seenValues.push_back(cell);
			tally.push_back(1);
		}
		else {
			++tally[slot];
		}
	}

	// Append the grand total as the last element.
	int total = 0;
	for (size_t k = 0; k < tally.size(); ++k) {
		total += tally[k];
	}
	tally.push_back(total);
	return tally;
}
示例#4
0
/*
* Debugging aid: writes the table to cout, one row per line,
* with a tab after every cell.
*/
void printAttributeTable(vvs &attributeTable)
{
	for (size_t row = 0; row != attributeTable.size(); ++row) {
		size_t col = 0;
		while (col != attributeTable[row].size()) {
			cout << attributeTable[row][col] << "\t";
			++col;
		}
		cout << endl;
	}
}
示例#5
0
/*
* Looks up a column by name: returns the index of the first entry of
* tableInfo whose element 0 equals columnName, or -1 if no entry matches.
*/
int returnColumnIndex(string &columnName, vvs &tableInfo)
{
	int match = -1;
	for (size_t pos = 0; match < 0 && pos < tableInfo.size(); ++pos) {
		if (columnName == tableInfo[pos][0]) {
			match = static_cast<int>(pos);
		}
	}
	return match;
}
示例#6
0
/*
 * function: naive_bayes::set_data -- installs the data set and derives the helper tables
 * d: the data set (must not be empty)
 * h: attribute names; the last column must be the target attribute and must be discrete
 * b: per attribute, whether it is discrete (false) or numeric (true);
 *    the flags are resized to the number of attributes, padding with false
 *
 * Returns false when clear() reports failure, true otherwise.
 */
bool naive_bayes::set_data(vvs& d, vs& h, vb b)
{
	// Start from a clean state; give up if that is not possible.
	if(!clear()) return false;

	assert(d.size() > 0); // the data set must not be empty
	datas      = d;
	headers    = h;
	num_attr   = (int)headers.size();
	num_data   = (int)d.size();
	is_numeric = b;
	is_numeric.resize(num_attr, false);
	assert(is_numeric.back() == false); // the target attribute must be discrete
	target_attr = headers.back();

	attr_to_int.resize(num_attr);
	int_to_attr.resize(num_attr);
	attrs_size.resize(num_attr);

	// Each discrete attribute gets its own value <-> index mapping, with
	// indices handed out in order of first appearance down the rows.
	for(int col = 0; col < num_attr; ++col)
	{
		if(!is_numeric[col]) // numeric attributes need no mapping
		{
			auto& to_int  = attr_to_int[col];
			auto& to_attr = int_to_attr[col];
			for(int row = 0; row < num_data; ++row)
			{
				const auto& value = d[row][col];
				if(to_int.find(value) == to_int.end())
				{
					to_int[value] = (int)to_attr.size();
					to_attr.push_back(value);
				}
			}
		}
		attrs_size[col] = (int)int_to_attr[col].size();
	}
	num_targ = attrs_size.back();

	// Index of the target-attribute value for every data row
	const int target_col = num_attr - 1;
	auto& target_map = attr_to_int[target_col];
	for(int row = 0; row < num_data; ++row)
		target_to_label.push_back(target_map[d[row][target_col]]);
	return true;
}
示例#7
0
// Two-colouring test starting from a single node.
//
// graph is read as an adjacency list: graph[v] lists the neighbours of v.
// node_number is seeded with colour color_number (1 or 2; any other value
// seeds nothing and the function returns true). Colours are then spread
// along the edges: every neighbour of a coloured node must carry the other
// colour. Returns false as soon as an edge joins two nodes of the same
// colour, true once every node reached from the seed has been processed.
bool check(vvs graph, 
	   size_t node_number, 
	   size_t color_number) {
  const size_t kBlue = 1;
  const size_t kRed = 2;

  vector<size_t> colors(graph.size(), 0);  // 0 = not coloured yet
  set<size_t> blue;  // blue nodes whose neighbours are still to be visited
  set<size_t> red;   // same, for red nodes

  if (color_number == kBlue) {
    blue.insert(node_number);
    colors[node_number] = kBlue;
  }
  else if (color_number == kRed) {
    red.insert(node_number);
    colors[node_number] = kRed;
  }

  while (!blue.empty() || !red.empty()) {
    // Pending blue nodes are always handled before pending red ones.
    const bool take_blue = !blue.empty();
    set<size_t>& source = take_blue ? blue : red;
    set<size_t>& other_side = take_blue ? red : blue;

    const size_t node = *source.begin();
    source.erase(source.begin());

    const size_t wanted = (colors[node] == kBlue) ? kRed : kBlue;

    for (size_t k = 0; k < graph[node].size(); ++k) {
      const size_t neighbour = graph[node][k];
      if (colors[neighbour] == 0) {
        colors[neighbour] = wanted;
        other_side.insert(neighbour);
      }
      else if (colors[neighbour] != wanted) {
        return false;
      }
    }
  }

  return true;
}
示例#8
0
/*
* Returns true if all data rows of a subtable (rows 1..n-1) carry the
* same value in the last column of the header row, i.e. the same class
* label. This means that that node's class label has been decided.
*
* A table without data rows (empty, or header only) or with an empty
* header has no labels that could disagree, so it is reported as
* homogeneous instead of reading past the end of the table.
*/
bool isHomogeneous(vvs &table)
{
	if (table.size() < 2 || table[0].empty()) {
		return true;
	}
	const size_t lastCol = table[0].size() - 1;
	const string &firstValue = table[1][lastCol];
	// Row 1 is the reference, so the comparison starts at row 2.
	for (size_t iii = 2; iii < table.size(); iii++) {
		if (firstValue != table[iii][lastCol]) {
			return false;
		}
	}
	return true;
}
// Debug helper: prints the number of variables in V, then one line per
// rule in A (its variable names separated by spaces) followed by the
// rule's value f[i] with three decimals.
// f is indexed in parallel with A, so it must have at least A.size() entries.
void printCase(vs V, vvs A, vd f){
    printf("Case Details:\n");
    printf("--------------\n");
    // V.size() is a size_t; passing it to %d is a format/argument mismatch,
    // so it is converted explicitly to the type the conversion expects.
    printf("Number Of Variables n: %lu\n\n", static_cast<unsigned long>(V.size()));
    printf("Rules:\n");
    printf("-------\n");
    for(size_t i=0; i<A.size(); i++){
        for(size_t j=0; j<A[i].size(); j++){
            printf("%s ", A[i][j].c_str());
        }
        printf(":: %.3lf\n", f[i]);
    }
}
// Enumerates every chain that leads from word back to start by following
// traces (traces[w] lists the words recorded as predecessors of w).
//
// path is the working stack of words visited so far, in walking order
// (from word towards start). Each time start is reached, the stack is
// stored in pathes in reverse, i.e. beginning with start. path is left
// exactly as it was on entry.
//
// A word without an entry in traces gets an empty one created by
// operator[] and simply ends that branch.
void BuildPath(map<string, vs> &traces, vvs &pathes, vs &path, string word, string start){
    path.push_back(word);
    if(word == start){
        // Build the reversed copy directly from reverse iterators.
        pathes.push_back(vs(path.rbegin(), path.rend()));
    }
    else{
        // Bind by reference instead of copying the whole predecessor list on
        // every call; insertions made into the map by deeper calls do not
        // invalidate references to existing std::map elements.
        const vs &previous = traces[word];
        for(size_t i = 0; i < previous.size(); ++i)
            BuildPath(traces, pathes, path, previous[i], start);
    }
    path.pop_back();
}
示例#11
0
/*
* Decides which column to split on based on entropy.
*
* For every attribute column (all columns of the header except the last,
* which holds the class label) the table is partitioned by the distinct
* values of that column; the entropy of the class labels inside each
* partition is weighted by the partition's row count and averaged.
* Returns the name of the column with the least weighted entropy; on
* ties the right-most such column wins.
*
* Edge cases: an empty table or empty header yields an empty string; a
* header consisting of the class column only yields that column's name.
*
* The per-column work is independent and carries an OpenMP pragma. Each
* iteration writes only its own slot of columnEntropies and the minimum
* is picked afterwards in a sequential pass, so the result does not
* depend on thread scheduling.
*/
string decideSplittingColumn(vvs &table)
{
	if (table.empty() || table[0].empty()) {
		return string();
	}
	const int attributeCount = static_cast<int>(table[0].size()) - 1;
	const int rowCount = static_cast<int>(table.size());
	vd columnEntropies(attributeCount, 0.0);

#pragma omp parallel for
	for (int column = 0; column < attributeCount; column++) {
		string colName = table[0][column];
		msi seenValues;
		// counts[k] = rows holding the k-th distinct value; last element = total
		vi counts = countDistinct(table, column);
		// attributeEntropy[k] = class entropy inside the k-th value's partition;
		// filled in first-appearance order so it lines up with counts.
		vd attributeEntropy;

		// Every data row has to be visited (row 0 is the header), otherwise a
		// value first seen in the last row would have a count but no entropy.
		for (int iii = 1; iii < rowCount; iii++) {
			const string &cellValue = table[iii][column];
			if (seenValues.find(cellValue) != seenValues.end()) {
				continue;	// partition for this value already processed
			}
			seenValues[cellValue] = 1;

			vvs tempTable = pruneTable(table, colName, cellValue);
			vi classCounts = countDistinct(tempTable, static_cast<int>(tempTable[0].size()) - 1);
			const double total = static_cast<double>(classCounts[classCounts.size() - 1]);

			// The trailing element of classCounts is the total, not a class.
			double entropy = 0.0;
			for (size_t jjj = 0; jjj + 1 < classCounts.size(); jjj++) {
				const double fraction = static_cast<double>(classCounts[jjj]) / total;
				entropy -= fraction * (log(fraction) / log(2.0));
			}
			attributeEntropy.push_back(entropy);
		}

		double columnEntropy = 0.0;
		for (size_t kkk = 0; kkk + 1 < counts.size(); kkk++) {
			columnEntropy += static_cast<double>(counts[kkk]) * attributeEntropy[kkk];
		}
		columnEntropies[column] = columnEntropy / static_cast<double>(counts[counts.size() - 1]);
	}

	// Sequential selection keeps the "<=" tie-break deterministic.
	int splittingColumn = 0;
	double minEntropy = DBL_MAX;
	for (int column = 0; column < attributeCount; column++) {
		if (columnEntropies[column] <= minEntropy) {
			minEntropy = columnEntropies[column];
			splittingColumn = column;
		}
	}
	return table[0][splittingColumn];
}
示例#12
0
/*
* Splits one comma-separated line into its fields and appends them to
* attributeTable as a new row. Empty fields are kept, and a line without
* any comma becomes a single-field row.
*
* The input string is consumed in the process: on return someString
* holds only the text after the last comma (the final field).
*/
void parse(string& someString, vvs &attributeTable)
{
	vs fields;
	size_t fieldStart = 0;
	size_t comma = someString.find(',', fieldStart);
	while (comma != string::npos) {
		fields.push_back(someString.substr(fieldStart, comma - fieldStart));
		fieldStart = comma + 1;
		comma = someString.find(',', fieldStart);
	}

	// Drop everything up to and including the last comma in one go.
	someString.erase(0, fieldStart);
	fields.push_back(someString);
	attributeTable.push_back(fields);
}
示例#13
0
/*
* Finds the class label that occurs most often in the training data.
* Labels are read from the last column of the header row, for every row
* after the header. When several labels share the highest count, the one
* visited last while iterating the tally wins. Without any data rows the
* result is an empty string.
* The returned label is meant to serve as the default class label.
*/
string returnMostFrequentClass(vvs &dataTable)
{
	msi labelTally;		// class label -> number of rows carrying it
	for (size_t row = 1; row < dataTable.size(); ++row) {
		const size_t labelColumn = dataTable[0].size() - 1;
		// operator[] creates a missing entry with 0, so this also covers first sightings
		++labelTally[dataTable[row][labelColumn]];
	}

	string winner;
	int winnerCount = 0;
	for (msi::iterator entry = labelTally.begin(); entry != labelTally.end(); ++entry) {
		if (entry->second < winnerCount) {
			continue;
		}
		winnerCount = entry->second;
		winner = entry->first;
	}
	return winner;
}
// Recursively evaluates a rule system by eliminating one variable per call.
//
//   V : names of the variables still to be eliminated; V[0] is handled here.
//   A : rules, each one a list of variable names.
//   f : one value per rule, indexed in parallel with A.
//
// With a single variable left the result is f[0]. Otherwise the value of
// the rule consisting of V[0] alone is taken as an integer count `val`,
// V[0] is stripped from V and from every rule, and for each candidate
// count val, val-1, ..., 1 the values of the reduced rules are rebuilt and
// the function recurses. The result is the largest `val * recur(...)`
// over all candidates, or 0 if no candidate is tried.
//
// NOTE(review): `val` is only assigned when a single-element rule equal to
// V[0] exists; otherwise it is read uninitialized. Confirm callers always
// supply such a rule.
// NOTE(review): INF is defined outside this block; presumably a large
// sentinel value -- confirm.
double recur(vs V, vvs A, vd f){
    //printCase(V, A, f);
    if(V.size()==1) return f[0];

    // The variable eliminated by this call.
    string var = V[0];

    // Look for the rule made of `var` alone; its value (converted from
    // double to int) is the starting count.
    // NOTE(review): a rule of size 0 would pass the size test and then be
    // indexed at [0]; presumably such rules never reach this point.
    int val;
    for(int i=0; i<A.size(); i++){
        if(A[i].size()>1) continue;
        if(var==A[i][0]) {
            val = f[i];
            break;
        }
    }

    vs Vij = V;
    vvs Aij(A.size(), vs());

    //step 1: drop the current variable from the variable list
    Vij.erase(Vij.begin());
    //step 2: copy every rule without the current variable
    for(int i=0; i<A.size(); i++){
        for(int j=0; j<A[i].size(); j++){
            if(var!=A[i][j]) Aij[i].push_back(A[i][j]);
        }
    }

    // remove empty rules
    for(int i=0; i<Aij.size(); i++){
        if(Aij[i].size()==0){
            Aij.erase(Aij.begin()+i);
            i--;
        }
    }

    // remove repeated rules (the round trip through std::set also leaves
    // the rules in the set's sorted order)
    set<vs> s(Aij.begin(), Aij.end());
    Aij.clear();
    Aij = vvs(s.begin(), s.end());

    // construct DAG: graph[i] lists every rule j that has more elements
    // than rule i and contains all of rule i's elements; graphT holds the
    // same edges reversed.
    vector<vi> graph(Aij.size(), vi());
    vector<vi> graphT(Aij.size(), vi());

    for(int i=0; i<Aij.size(); i++){
        for(int j=0; j<Aij.size(); j++){
            if(i==j||Aij[i].size()>=Aij[j].size()) continue;
            bool subset=1;
            for(int k=0; k<Aij[i].size()&&subset; k++){
                bool found=0;
                for(int l=0; l<Aij[j].size(); l++){
                    if(Aij[i][k]==Aij[j][l]){
                        found=1;
                        break;
                    }
                }
                subset&=found;
            }
            if(subset){
                graph[i].push_back(j);
                graphT[j].push_back(i);
            }
        }
    }

    // Try for all values of the number of instances of the cur variable
    double maxValue=0;
    while(val>0){
        // Value of every reduced rule for this candidate count.
        vd fij(Aij.size(), INF);

        for(int i=0; i<Aij.size(); i++){
            // S is the reduced rule, Sv the same rule with `var` added back.
            vs S = Aij[i];
            vs Sv = S;
            Sv.push_back(var);
            bool foundS=false, foundSv=false;
            double foundSval, foundSvVal;

            // Search the original rules for one matching S and one matching
            // Sv. A rule matches when it has the same size and contains
            // every element of the candidate. The value taken for Sv is
            // divided by the current count.
            for(int j=0; j<A.size(); j++){
                if(S.size()==A[j].size()){
                    bool sameSet=1;
                    for(int k=0; k<S.size()&&sameSet; k++){
                        bool found=0;
                        for(int l=0; l<S.size(); l++){
                            if(S[k]==A[j][l]) {
                                found = 1;
                                break;
                            }
                        }
                        sameSet&=found;
                    }
                    if(sameSet) foundS=true, foundSval=f[j];
                }
                if(Sv.size()==A[j].size()){
                    bool sameSet=1;
                    for(int k=0; k<Sv.size()&&sameSet; k++){
                        bool found=0;
                        for(int l=0; l<Sv.size(); l++){
                            if(Sv[k]==A[j][l]) {
                                found = 1;
                                break;
                            }
                        }
                        sameSet&=found;
                    }
                    if(sameSet) foundSv=true, foundSvVal=f[j]/val;
                }
            }
            // Use whichever value was found, or the smaller one when both
            // were found.
            // NOTE(review): when neither was found, foundSval is read
            // uninitialized; presumably every reduced rule stems from S or
            // Sv in A, so this cannot happen -- confirm.
            if(!foundSv) fij[i]=foundSval;
            else if(!foundS) fij[i]=foundSvVal;
            else fij[i]=min(foundSval,foundSvVal);
        }

        // Propagate along the DAG: start from the rules that have no
        // superset and walk towards their subsets, lowering each visited
        // rule's value to the minimum over its supersets.
        queue<int> q;
        for(int i=0; i<Aij.size(); i++){
            if(graph[i].size()==0) q.push(i);
        }
        while(!q.empty()){
            int node = q.front(); q.pop();
            for(int i=0; i<graph[node].size(); i++){
                fij[node]=min(fij[node], fij[graph[node][i]]);
            }
            for(int i=0; i<graphT[node].size(); i++){
                q.push(graphT[node][i]);
            }
        }

        // Solve the reduced problem and scale it by the current count.
        maxValue=max(maxValue, val*recur(Vij, Aij, fij));
        val--;
    }

    return maxValue;
}
示例#15
0
/*
* Returns true if the table holds no data rows, i.e. it consists of at
* most the header row (this includes a table with no rows at all).
* Returns false otherwise.
*/
bool tableIsEmpty(vvs &table)
{
	return table.size() <= 1;
}