Пример #1
0
 // Appends to *v every non-empty combination obtained by extending *curSubSet
 // with elements of *bag, keeping the elements in their original relative order.
 // For each leading element of the bag, the extended subset is emitted first and
 // then all of its further extensions (depth-first).
 //
 // v          - output list; one vector is appended per generated subset
 // curSubSet  - prefix shared by all generated subsets; it is used as scratch
 //              space and holds its initial contents again on return
 // bag        - candidate elements; it is consumed from the front and is empty
 //              on return
 void printSubsetsHelper(vector<vector<int>> *v , vector<int> * curSubSet, vector<int> * bag) {
     // The step expression drops the head that has just been processed.
     for (; !bag->empty(); bag->erase(bag->begin())) {
         // Elements that follow the current head; the recursive call consumes this copy.
         vector<int> tail(bag->begin() + 1, bag->end());

         curSubSet->push_back(bag->front());
         v->push_back(*curSubSet);
         printSubsetsHelper(v, curSubSet, &tail);
         curSubSet->pop_back();
     }
 }
Пример #2
0
//Loads the data (train, validation, test sets) into memory following the specification 
//in the attr file.
//Parameters:
//	trainFName, validFName, testFName - names of the data files; any of them may be an empty 
//		string if the corresponding data set is not provided (its size is then set to 0)
//	attrFName - name of the attribute file that describes the columns of the data files
//	doOut - when true, a progress message is written to cout after every 100000 lines read
//Throws one of the following error codes: OPEN_ATTR_ERR, MULT_CLASS_ERR, ATTR_NAME_DEF_ERR,
//	ATTR_TYPE_ERR, NO_CLASS_ERR, NOM_ACTIVE_ERR, OPEN_TRAIN_ERR, ATTR_NOT_BOOL_ERR, 
//	TRAIN_EMPTY_ERR, OPEN_VALID_ERR, VALID_EMPTY_ERR, OPEN_TEST_ERR.
//	A TE_ERROR raised by readData is rethrown after the offending line number is written to cerr.
INDdata::INDdata(const char* trainFName, const char* validFName, const char* testFName,
	const char* attrFName, bool doOut)
{
	//all messages written through "clog" below go to this local LogStream object
	LogStream clog;

	//read attr file, collect info about boolean attributes and attrN
	clog << "Reading the attribute file: \"" << attrFName << "\"\n";
	fstream fattr;
	fattr.open(attrFName, ios_base::in);
	if(!fattr) 
		throw OPEN_ATTR_ERR;

	char buf[LINE_LEN];	//buffer for reading from input files
	//Read the first line. Every loop in this function uses gcount() of the stream as the
	//"a line was read" test - presumably getLineExt leaves it at 0 at the end of the file
	//(TODO confirm against getLineExt)
	getLineExt(fattr, buf);

	//read list of attributes, collect information about them
	//attrId counts real attributes only (class and weight lines are skipped), 
	//colNo counts every listed line, i.e. every column of the data files
	int attrId, colNo; // counters
	string tarName; //name of the response attribute
	bool foundClass = false;	//response found flag
	weightColNo = -1;	//-1 means "no weight column"
	for(attrId = 0, colNo = 0; fattr.gcount(); attrId++, colNo++)
	{
		string attrStr(buf);	//a line of an attr file (corresponds to 1 attribute)
		
		//check for response attribute
		if(attrStr.find("(class)") != string::npos)	
		{
			if(foundClass)
				throw MULT_CLASS_ERR;

			tarColNo = colNo;
			attrId--;	//cancels the loop increment: the response is a column, but not an attribute
			foundClass = true;

			//the name of the response is the text before the first ':'
			string::size_type nameLen = attrStr.find(":");
			tarName = attrStr.substr(0, nameLen);

			getLineExt(fattr, buf);
			continue;
		}
		//check for weight attribute
		if(attrStr.find("(weight)") != string::npos)	
		{
			weightColNo = colNo;
			attrId--;	//cancels the loop increment: the weight is a column, but not an attribute

			getLineExt(fattr, buf);
			continue;
		}

		//parse attr name
		//find() returns string::npos on failure; the -1 below converts to the same value
		string::size_type nameLen = attrStr.find(":");
		if((attrStr.find("contexts") != -1) || (nameLen == -1)) 
			break; //end of listed attributes; buf still holds this line for the contexts loop below
		string attrName = attrStr.substr(0, nameLen);
		//names containing any of these characters are rejected
		if(attrName.find_first_of("\\/*?\"<>|:") != string::npos)
			throw ATTR_NAME_DEF_ERR;
		attrNames.push_back(trimSpace(attrName));

		//parse attr type: the text between the first ':' and the first '.' of the line
		string::size_type endType = attrStr.find(".");
		string typeStr = attrStr.substr(nameLen + 1, endType - nameLen - 1);
		typeStr = trimSpace(typeStr);
		if(typeStr.compare("0,1") == 0)
			boolAttrs.insert(attrId);
		else if(typeStr.compare("nom") == 0)
			nomAttrs.insert(attrId);
		//NOTE(review): "cont" is searched for in the whole line, not in typeStr, 
		//so a "cont" inside the attribute name also passes this check
		else if(attrStr.find("cont") == string::npos) 
			throw ATTR_TYPE_ERR;

		getLineExt(fattr, buf);
	}
	attrN = attrId;
	colN = colNo;
	if(!foundClass)
		throw NO_CLASS_ERR;
	
	//read contexts part (if any), add unused attributes into ignoreattrs
	//the first line processed is the one that ended the attributes loop above
	while(fattr.gcount())
	{
		string attrStr(buf);
		if(attrStr.find(" never") != string::npos)
		{//extract name of the attribute, find its number, insert it into ignoreattrs
			//the name is the text before the first space of the line
			int nameLen = (int)attrStr.find(" ");
			string attrName = attrStr.substr(0, nameLen);
			attrName = trimSpace(attrName);
			int neverAttrId = getAttrId(attrName);
			if (neverAttrId == -1)	//unknown name: warn and go on
				clog << "\nWARNING: trying to exclude \"" << attrName << "\" - this is not a valid feature\n\n";
			else
				ignoreAttrs.insert(neverAttrId);
		}
		getLineExt(fattr, buf);
	}
	fattr.close();
	
	int activeAttrN = attrN - (int)ignoreAttrs.size();
	clog << attrN << " attributes\n" << activeAttrN << " active attributes\n\n";
	//presumably requires every nominal attribute to be among the ignored ones - 
	//TODO confirm the argument order of isSubset
	if(!isSubset(nomAttrs, ignoreAttrs))
		throw NOM_ACTIVE_ERR;

	//Read data
	if(string(trainFName).compare("") != 0)
	{//Read train set
		clog << "Reading the train set: \"" << trainFName << "\"\n";
		fstream fin;
		fin.open(trainFName, ios_base::in);
		if(fin.fail()) 
			throw OPEN_TRAIN_ERR;
		 
		//presumably the "has missing values" flag; it is not assigned again in this 
		//constructor - readData may update it (TODO confirm)
		hasMV = false;
		getLineExt(fin, buf);
		int caseNo;
		for(caseNo = 0; fin.gcount(); caseNo++)
		{//read one line of data file, save class value in targets, attribute values in data
			if(doOut && ((caseNo + 1)% 100000 == 0))
				cout << "\tRead " << caseNo + 1 << " lines..." << endl;
			
			floatv item;	//single data point
			try {
				readData(buf, fin.gcount(), item, colN);
			} catch (TE_ERROR err) {
				//report the 1-based number of the offending line, then pass the error on
				cerr << "\nLine " << caseNo + 1 << "\n";
				throw err;
			}
			
			//save response and weight, then remove their columns from the item;
			//the column with the larger index is erased first, so the smaller index stays valid
			trainTar.push_back(item[tarColNo]);
			if(weightColNo != -1)
				trainW.push_back(item[weightColNo]);
			item.erase(item.begin() + max(tarColNo, weightColNo));
			if(weightColNo != -1)
				item.erase(item.begin() + min(tarColNo, weightColNo));

			//values of boolean attributes must be 0, 1 or NaN (as reported by wxisNaN)
			for(intset::iterator boolIt = boolAttrs.begin(); boolIt != boolAttrs.end(); boolIt++)
				if((item[*boolIt] != 0) && (item[*boolIt] != 1) && !wxisNaN(item[*boolIt]))
					throw ATTR_NOT_BOOL_ERR;
			train.push_back(item);
			getLineExt(fin, buf);
		}
		trainN = caseNo;
		trainV = trainN;
		if(trainN == 0)
			throw TRAIN_EMPTY_ERR;
		if(weightColNo != -1)
		{//rescale the weights so that they sum up to trainN, 
		 //fill trainR with the running (cumulative) sums of the rescaled weights
			double trainSum = 0;
			trainR.resize(trainN);
			for(int itemNo = 0; itemNo < trainN; itemNo++)
				trainSum += trainW[itemNo];
			//NOTE(review): trainSum == 0 is not guarded against here
			double trCoef = trainN / trainSum;
			for(int itemNo = 0; itemNo < trainN; itemNo++)
			{
				trainW[itemNo] *= trCoef;
				trainR[itemNo] = (itemNo == 0) ? trainW[itemNo] : trainW[itemNo] + trainR[itemNo - 1];
			}

		}
		double trainStD = getTarStD(TRAIN);
		clog << trainN << " points in the train set, std. dev. of " << tarName << " values = " << trainStD 
			<< "\n\n"; 
		fin.close();

		//initialize bootstrap (bag of data)
		bootstrap.resize(trainN); 
		newBag();	
	}
	else //no train set
		trainN = 0;

	if(string(validFName).compare("") != 0)
	{//Read validation set
		clog << "Reading the validation set: \"" << validFName << "\"\n";
		fstream fvalid;
		fvalid.open(validFName, ios_base::in); 
		if(fvalid.fail())
			throw OPEN_VALID_ERR;

		getLineExt(fvalid, buf);
		int caseNo;
		for(caseNo=0; fvalid.gcount(); caseNo++)
		{//read one line of data file, save response value in validtar, attributes values in valid
			if (doOut && ((caseNo + 1) % 100000 == 0))
				cout << "\tRead " << caseNo + 1 << " lines..." << endl;
			
			floatv item;	//single data point
			try {
				readData(buf, fvalid.gcount(), item, colN);
			} catch (TE_ERROR err) {
				//report the 1-based number of the offending line, then pass the error on
				cerr << "\nLine " << caseNo + 1 << "\n";
				throw err;
			}

			//save response and weight, then remove their columns (larger index first)
			validTar.push_back(item[tarColNo]);
			if(weightColNo != -1)
				validW.push_back(item[weightColNo]);
			item.erase(item.begin() + max(tarColNo, weightColNo));
			if(weightColNo != -1)
				item.erase(item.begin() + min(tarColNo, weightColNo));

			//unlike the train set, boolean attribute values are not checked here
			valid.push_back(item);
			getLineExt(fvalid, buf);
		}
		validN = caseNo;
		if(validN == 0)
			throw VALID_EMPTY_ERR;
		double validStD = getTarStD(VALID);
		clog << validN << " points in the validation set, std. dev. of " << tarName << " values = " 
			<< validStD << "\n\n"; 
		fvalid.close();
	}
	else	//no validation set
		validN = 0;

	if(string(testFName).compare("") != 0)
	{//Read test set
		clog << "Reading the test set: \"" << testFName << "\"\n";
		fstream ftest;
		ftest.open(testFName, ios_base::in); 
		if(ftest.fail()) 
			throw OPEN_TEST_ERR;

		getLineExt(ftest, buf);
		int caseNo;
		for(caseNo=0; ftest.gcount(); caseNo++)
		{//read one line of data file, save response value in testtar, attributes in test
			if (doOut && ((caseNo + 1) % 100000 == 0))
				cout << "\tRead " << caseNo + 1 << " lines...\n";

			floatv item;	//single data point
			try {
				readData(buf, ftest.gcount(), item, colN);
			} catch (TE_ERROR err) {
				//report the 1-based number of the offending line, then pass the error on
				cerr << "\nLine " << caseNo + 1 << "\n";
				throw err;
			}

			//save response and weight, then remove their columns (larger index first)
			testTar.push_back(item[tarColNo]);
			if(weightColNo != -1)
				testW.push_back(item[weightColNo]);
			item.erase(item.begin() + max(tarColNo, weightColNo));
			if(weightColNo != -1)
				item.erase(item.begin() + min(tarColNo, weightColNo));

			test.push_back(item);
			getLineExt(ftest, buf);
		}
		//NOTE(review): unlike train and validation, an empty test set is not rejected, 
		//so getTarStD(TEST) can be called with testN == 0
		testN = caseNo;
		double testStD = getTarStD(TEST);
		clog << testN << " points in the test set, std. dev. of " << tarName << " values = " << testStD 
			<< "\n\n";
		ftest.close();
	}
	else	//no test set
		testN = 0;
}