Beispiel #1
0
// Cross-validates a collaborative filter on a single dataset and writes the
// RMSE, MSE and MAE to stdout.
//
// Arguments (in order): [-seed <uint>] [-folds <uint>] <dataset> <algorithm...>
// Raises an error (through ThrowError) for an unrecognized flag, fewer than
// two folds, a missing dataset, or arguments left over after the algorithm.
void crossValidate(GArgReader& args)
{
	// Defaults: the seed mixes the process id with the current time
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	size_t foldCount = 2;

	// Consume the leading flags
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-folds"))
		{
			foldCount = args.pop_uint();
			continue;
		}
		ThrowError("Invalid crossvalidate option: ", args.peek());
	}
	if(foldCount < 2)
		ThrowError("There must be at least 2 folds.");

	// The dataset comes next
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pDataset = loadData(args.pop_string());
	Holder<GMatrix> hDataset(pDataset);

	// Whatever remains must describe the recommender, and nothing else
	GRand prng(rngSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Evaluate and report
	double meanAbsErr;
	double meanSqErr = pFilter->crossValidate(*pDataset, foldCount, &meanAbsErr);
	cout << "RMSE=" << sqrt(meanSqErr);
	cout << ", MSE=" << meanSqErr;
	cout << ", MAE=" << meanAbsErr << "\n";
}
Beispiel #2
0
// Trains a collaborative filter on one dataset, evaluates it on a second
// dataset, and prints the resulting MSE and MAE to stdout.
//
// Arguments (in order): [-seed <uint>] <training set> <test set> <algorithm...>
// Throws Ex for an unrecognized flag, a missing dataset, or arguments left
// over after the algorithm has been parsed.
void GRecommenderLib::transacc(GArgReader& args)
{
	// Parse options (default seed mixes the process id with the current time)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			throw Ex("Invalid transacc option: ", args.peek()); // previously mislabeled as a crossvalidate option
	}

	// Load the data
	if(args.size() < 1)
		throw Ex("No training set specified.");
	GMatrix train;
	loadData(train, args.pop_string());
	if(args.size() < 1)
		throw Ex("No test set specified.");
	GMatrix test;
	loadData(test, args.pop_string());

	// Instantiate the recommender
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Train on the first set and measure the error on the second
	double mae;
	double mse = pModel->trainAndTest(train, test, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
Beispiel #3
0
// Computes precision-recall data for a collaborative filter on one dataset
// and prints it to stdout with column 2 (the false-positive rate) removed.
//
// Arguments (in order): [-seed <uint>] [-ideal] <dataset> <algorithm...>
// Raises an error (through ThrowError) for an unrecognized flag, a missing
// dataset, or arguments left over after the algorithm.
void precisionRecall(GArgReader& args)
{
	// Defaults: the seed mixes the process id with the current time
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	bool useIdeal = false;

	// Consume the leading flags
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-ideal"))
		{
			useIdeal = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// The dataset comes next
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pDataset = loadData(args.pop_string());
	Holder<GMatrix> hDataset(pDataset);

	// Whatever remains must describe the recommender, and nothing else
	GRand prng(rngSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Build the curve, discard the false-positive rate column, and print
	GMatrix* pCurve = pFilter->precisionRecall(*pDataset, useIdeal);
	Holder<GMatrix> hCurve(pCurve);
	pCurve->deleteColumn(2);
	pCurve->print(cout);
}
Beispiel #4
0
// Trains a collaborative filter on one dataset, evaluates it on a second
// dataset, and prints the resulting MSE and MAE to stdout.
//
// Arguments (in order): [-seed <uint>] <training set> <test set> <algorithm...>
// Raises an error (through ThrowError) for an unrecognized flag, a missing
// dataset, or arguments left over after the algorithm has been parsed.
void transacc(GArgReader& args)
{
	// Parse options (default seed mixes the process id with the current time)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid transacc option: ", args.peek()); // previously mislabeled as a crossvalidate option
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No training set specified.");
	GMatrix* pTrain = loadData(args.pop_string());
	Holder<GMatrix> hTrain(pTrain);
	if(args.size() < 1)
		ThrowError("No test set specified.");
	GMatrix* pTest = loadData(args.pop_string());
	Holder<GMatrix> hTest(pTest);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Train on the first set and measure the error on the second
	double mae;
	double mse = pModel->trainAndTest(*pTrain, *pTest, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
Beispiel #5
0
// Computes precision-recall data for a collaborative filter on one dataset
// and prints it to stdout with column 2 (the false-positive rate) removed.
//
// Arguments (in order): [-seed <uint>] [-ideal] <dataset> <algorithm...>
// Throws Ex for an unrecognized flag, a missing dataset, or arguments left
// over after the algorithm.
void GRecommenderLib::precisionRecall(GArgReader& args)
{
	// Defaults: the seed mixes the process id with the current time
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	bool useIdeal = false;

	// Consume the leading flags
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-ideal"))
		{
			useIdeal = true;
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// The dataset comes next
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix dataset;
	loadData(dataset, args.pop_string());

	// Whatever remains must describe the recommender, and nothing else
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(rngSeed);

	// Build the curve, discard the false-positive rate column, and print
	GMatrix* pCurve = pFilter->precisionRecall(dataset, useIdeal);
	std::unique_ptr<GMatrix> hCurve(pCurve);
	pCurve->deleteColumns(2, 1);
	pCurve->print(cout);
}
Beispiel #6
0
// Applies the GLLE transform to a dataset and prints the transformed data to
// stdout.
//
// Arguments (in order): <dataset> <neighbor finder...> <target dims> [-seed <uint>]
// Throws Ex for any unrecognized trailing option.
void lle(GArgReader& args)
{
	// Positional parameters: dataset, neighbor finder, target dimensionality
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int targetDims = args.pop_uint();

	// Everything that remains must be a recognized option
	while(args.size() > 0)
	{
		if(!args.if_pop("-seed"))
			throw Ex("Invalid option: ", args.peek());
		prng.setSeed(args.pop_uint());
	}

	// Run the transform and print the result
	GLLE transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setNeighborFinder(pFinder);
	GMatrix* pOutput = transform.doit(*pInput);
	Holder<GMatrix> hOutput(pOutput);
	pOutput->print(cout);
}
Beispiel #7
0
// Writes an error message to stderr followed by usage help taken from the
// cluster usage tree. If the argument at position 1 names a known command,
// that command's usage is printed; otherwise the top-level usage is printed.
//
// args      - argument reader; its position is reset to 1 by this function
// szAppName - application name shown in the usage hints
// szMessage - the error text to display
void showError(GArgReader& args, const char* szAppName, const char* szMessage)
{
	cerr << "_________________________________\n";
	cerr << szMessage << "\n\n";

	// Look at the command the user attempted (if any)
	args.set_pos(1);
	const char* szAttempted = args.peek();
	UsageNode* pTree = makeClusterUsageTree();
	std::unique_ptr<UsageNode> hTree(pTree);

	if(!szAttempted)
	{
		// No command at all: show the top level and ask for more input
		pTree->print(cerr, 0, 3, 76, 1, false);
		cerr << "\nFor more specific usage information, enter as much of the command as you know.\n";
	}
	else
	{
		UsageNode* pMatch = pTree->choice(szAttempted);
		cerr << "Brief Usage Information:\n\n";
		if(pMatch)
		{
			cerr << szAppName << " ";
			pMatch->print(cerr, 0, 3, 76, 1000, true);
		}
		else
			pTree->print(cerr, 0, 3, 76, 1, false);
	}

	// Pointers to further help
	cerr << "\nTo see full usage information, run:\n	" << szAppName << " usage\n\n";
	cerr << "For a graphical tool that will help you to build a command, run:\n	waffles_wizard\n";
	cerr.flush();
}
Beispiel #8
0
// Reduces a dataset with GBreadthFirstUnfolding and prints the result to
// stdout.
//
// Arguments (in order): <dataset> <neighbor finder...> <target dims>
//                       [-seed <uint>] [-reps <uint>]
// Throws Ex for any unrecognized trailing option.
void breadthFirstUnfolding(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	int targetDims = args.pop_uint();

	// Parse Options
	size_t reps = 1;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			nSeed = args.pop_uint();
			// Also reseed the generator handed to the neighbor finder, as the
			// sibling commands (lle, isomap) do, so -seed covers every source
			// of randomness in this command.
			prng.setSeed(nSeed);
		}
		else if(args.if_pop("-reps"))
			reps = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GBreadthFirstUnfolding transform(reps, pNF->neighborCount(), targetDims);
	transform.rand().setSeed(nSeed);
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.reduce(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
Beispiel #9
0
// Writes an error message to stderr followed by usage help for neighbor
// finders. If the next argument names a known neighbor finder, its usage is
// printed; otherwise the list of available choices is printed.
//
// szMessage - the error text to display
// args      - argument reader positioned at the neighbor finder name (peeked,
//             not consumed)
void showInstantiateNeighborFinderError(const char* szMessage, GArgReader& args)
{
	cerr << "_________________________________\n";
	cerr << szMessage << "\n\n";
	const char* szNFName = args.peek();
	UsageNode* pNFTree = makeNeighborUsageTree();
	Holder<UsageNode> hNFTree(pNFTree);
	if(szNFName)
	{
		UsageNode* pUsageAlg = pNFTree->choice(szNFName);
		if(pUsageAlg)
		{
			cerr << "Partial Usage Information:\n\n";
			pUsageAlg->print(cerr, 0, 3, 76, 1000, true);
		}
		else
		{
			// Spelling of "technique" corrected in this user-facing message
			cerr << "\"" << szNFName << "\" is not a recognized neighbor-finding technique. Try one of these:\n\n";
			pNFTree->print(cerr, 0, 3, 76, 1, false);
		}
	}
	else
	{
		cerr << "Expected a neighbor-finding technique. Here are some choices:\n";
		pNFTree->print(cerr, 0, 3, 76, 1, false);
	}
	cerr << "\nTo see full usage information, run:\n	waffles_transform usage\n\n";
	cerr << "For a graphical tool that will help you to build a command, run:\n	waffles_wizard\n";
	cerr.flush();
}
Beispiel #10
0
// Prints the linear correlation coefficient between two attributes of a
// dataset to stdout with 14 digits of precision.
//
// Arguments (in order): <dataset> <attr1> <attr2> [-aboutorigin]
// With -aboutorigin both reference means are 0; otherwise each attribute's
// mean is used. Raises an error (through ThrowError) for any other option.
void correlation(GArgReader& args)
{
	// Positional parameters
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Everything that remains must be a recognized option
	bool aboutOrigin = false;
	while(args.size() > 0)
	{
		if(!args.if_pop("-aboutorigin"))
			ThrowError("Invalid option: ", args.peek());
		aboutOrigin = true;
	}

	// Reference means default to the origin
	double mean1 = 0;
	double mean2 = 0;
	if(!aboutOrigin)
	{
		mean1 = pMatrix->mean(attr1);
		mean2 = pMatrix->mean(attr2);
	}

	double coefficient = pMatrix->linearCorrelationCoefficient(attr1, mean1, attr2, mean2);
	cout.precision(14);
	cout << coefficient << "\n";
}
Beispiel #11
0
// Prepends an "index" column to a dataset and prints the merged data to
// stdout. Row i receives the value start + i * increment.
//
// Arguments (in order): <dataset> [-start <double>] [-increment <double>]
// Defaults: start = 0.0, increment = 1.0.
// Raises an error (through ThrowError) for any unrecognized option.
void AddIndexAttribute(GArgReader& args)
{
	// Parse args
	const char* filename = args.pop_string();
	double nStartValue = 0.0;
	double nIncrement = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-start"))
			nStartValue = args.pop_double();
		else if(args.if_pop("-increment"))
			nIncrement = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);

	// Hand the new relation to its smart pointer before calling anything that
	// might throw, so it cannot leak.
	GArffRelation* pIndexRelation = new GArffRelation();
	sp_relation pIndexRel = pIndexRelation;
	pIndexRelation->addAttribute("index", 0, NULL);

	// Build the single-column index matrix
	GMatrix indexes(pIndexRel);
	indexes.newRows(pData->rows());
	for(size_t i = 0; i < pData->rows(); i++)
		indexes.row(i)[0] = nStartValue + i * nIncrement;

	// Index column first, original columns after it
	GMatrix* pUnified = GMatrix::mergeHoriz(&indexes, pData);
	Holder<GMatrix> hUnified(pUnified);
	pUnified->print(cout);
}
Beispiel #12
0
// Reduces a dataset with GIsomap and prints the result to stdout.
//
// Arguments (in order): <dataset> <neighbor finder...> <target dims>
//                       [-seed <uint>] [-tolerant]
// -tolerant makes the transform drop disconnected points.
// Throws Ex for any unrecognized trailing option.
void isomap(GArgReader& args)
{
	// Positional parameters: dataset, neighbor finder, target dimensionality
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int targetDims = args.pop_uint();

	// Everything that remains must be a recognized option
	bool dropDisconnected = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-tolerant"))
		{
			dropDisconnected = true;
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Configure and run the transform, then print the result
	GIsomap transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setNeighborFinder(pFinder);
	if(dropDisconnected)
		transform.dropDisconnectedPoints();
	GMatrix* pOutput = transform.reduce(*pInput);
	Holder<GMatrix> hOutput(pOutput);
	pOutput->print(cout);
}
Beispiel #13
0
// Replaces the missing values in every attribute of a dataset and prints the
// result to stdout.
//
// Arguments (in order): <dataset> [-seed <uint>] [-random]
// With -random, replaceMissingValuesRandomly is used for each attribute;
// otherwise replaceMissingValuesWithBaseline is used.
// Raises an error (through ThrowError) for any unrecognized option.
void fillMissingValues(GArgReader& args)
{
	// Load the dataset
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// Options (default seed mixes the process id with the current time)
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	bool fillRandomly = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-random"))
		{
			fillRandomly = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Fill each attribute in turn with the selected strategy
	GRand prng(rngSeed);
	for(size_t attr = 0; attr < pMatrix->relation()->size(); attr++)
	{
		if(fillRandomly)
			pMatrix->replaceMissingValuesRandomly(attr, &prng);
		else
			pMatrix->replaceMissingValuesWithBaseline(attr);
	}
	pMatrix->print(cout);
}
Beispiel #14
0
// Runs GKMeans on a dataset and prints the matrix returned by its reduce()
// call to stdout.
//
// Arguments (in order): <dataset> <clusters> [-seed <uint>] [-reps <uint>]
// Throws Ex for any unrecognized option.
void kmeans(GArgReader& args)
{
	// Positional parameters
	GMatrix dataset;
	loadData(dataset, args.pop_string());
	int clusterCount = args.pop_uint();

	// Options (default seed mixes the process id with the current time)
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	size_t repetitions = 1;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-reps"))
		{
			repetitions = args.pop_uint();
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Cluster and print
	GRand prng(rngSeed);
	GKMeans clusterer(clusterCount, &prng);
	clusterer.setReps(repetitions);
	GMatrix* pResult = clusterer.reduce(dataset);
	std::unique_ptr<GMatrix> hResult(pResult);
	pResult->print(cout);
}
Beispiel #15
0
// Splits a dataset by the values of a nominal attribute and saves one ARFF
// file per value. Each output file is named "<base>_<value>.arff", where
// <base> is the portion of the input filename between pd.fileStart and
// pd.extStart as reported by GFile::parsePath.
//
// Arguments (in order): <dataset> <class attribute index> [-dropclass]
// -dropclass removes the class column from each output file.
// Raises an error (through ThrowError) for any unrecognized option.
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	size_t classAttr = args.pop_uint();

	bool dropClass = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// The base name does not depend on the class value, so compute it once
	// rather than re-parsing the path on every iteration.
	PathData pd;
	GFile::parsePath(filename, &pd);
	string baseName;
	baseName.assign(filename + pd.fileStart, pd.extStart - pd.fileStart);

	for(size_t i = 0; i < pData->relation()->valueCount(classAttr); i++)
	{
		// Split the rows for class value i out of pData into tmp
		GMatrix tmp(pData->relation(), pData->heap());
		pData->splitByNominalValue(&tmp, classAttr, i);

		// Build the output filename: <base>_<value>.arff
		std::ostringstream oss;
		oss << baseName << "_";
		pData->relation()->printAttrValue(oss, classAttr, (double)i);
		oss << ".arff";
		string s = oss.str();

		if(dropClass)
			tmp.deleteColumn(classAttr);
		tmp.saveArff(s.c_str());
	}
}
Beispiel #16
0
// Splits a dataset into two ARFF files. The requested number of rows stays
// in the first file; the remaining rows are split off (via splitBySize) and
// saved to the second file.
//
// Arguments (in order): <dataset> <row count> <filename1> <filename2>
//                       [-shuffle] [-seed <uint>]
// Raises an error (through ThrowError) if the row count exceeds the number
// of rows in the data, or for any unrecognized option.
void split(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Validate with unsigned arithmetic only. The previous
	// "(int)rows - pop_uint()" computation could wrap to a positive int for
	// very large requests and slip past the range check.
	size_t keepRows = args.pop_uint();
	if(keepRows > pData->rows())
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	size_t pats = pData->rows() - keepRows;
	const char* szFilename1 = args.pop_string();
	const char* szFilename2 = args.pop_string();

	// Parse options (default seed mixes the process id with the current time)
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool shouldShuffle = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
			shouldShuffle = true;
		else if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle if necessary
	GRand rng(nSeed);
	if(shouldShuffle)
		pData->shuffle(rng);

	// Split
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFilename1);
	other.saveArff(szFilename2);
}
Beispiel #17
0
// Trains a GNominalToCat transform on a dataset, applies it to that same
// dataset, and prints the transformed data to stdout.
//
// Arguments (in order): <dataset> [-maxvalues <uint>]
// The max-values setting defaults to 12.
// Raises an error (through ThrowError) for any unrecognized option.
void nominalToCat(GArgReader& args)
{
	// Load the dataset
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Everything that remains must be a recognized option
	int maxValues = 12;
	while(args.size() > 0)
	{
		if(!args.if_pop("-maxvalues"))
			ThrowError("Invalid option: ", args.peek());
		maxValues = args.pop_uint();
	}

	// Train the transform on the data, then convert the data with it
	GNominalToCat transform(maxValues);
	transform.train(*pInput);
	GMatrix* pOutput = transform.transformBatch(*pInput);
	Holder<GMatrix> hOutput(pOutput);

	pOutput->print(cout);
}
Beispiel #18
0
void showInstantiateAlgorithmError(const char* szMessage, GArgReader& args)
{
	cerr << "_________________________________\n";
	cerr << szMessage << "\n\n";
	const char* szAlgName = args.peek();
	UsageNode* pAlgTree = makeCollaborativeFilterUsageTree();
	Holder<UsageNode> hAlgTree(pAlgTree);
	if(szAlgName)
	{
		UsageNode* pUsageAlg = pAlgTree->choice(szAlgName);
		if(pUsageAlg)
		{
			cerr << "Partial Usage Information:\n\n";
			pUsageAlg->print(cerr, 0, 3, 76, 1000, true);
		}
		else
		{
			cerr << "\"" << szAlgName << "\" is not a recognized algorithm. Try one of these:\n\n";
			pAlgTree->print(cerr, 0, 3, 76, 1, false);
		}
	}
	else
	{
		cerr << "Expected an algorithm. Here are some choices:\n";
		pAlgTree->print(cerr, 0, 3, 76, 1, false);
	}
	cerr << "\nTo see full usage information, run:\n	waffles_learn usage\n\n";
	cerr << "For a graphical tool that will help you to build a command, run:\n	waffles_wizard\n";
	cerr.flush();
}
Beispiel #19
0
// Adds noise to a dataset and prints the result to stdout. Each affected
// element is increased by dev * prng.normal().
//
// Arguments (in order): <dataset> <dev> [-seed <uint>] [-excludelast <uint>]
// -excludelast leaves that many trailing columns untouched.
// Raises an error (through ThrowError) for an unrecognized option or when
// -excludelast exceeds the number of columns.
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options (default seed mixes the process id with the current time)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek()); // previously mislabeled as a neighbor finder option
	}

	// Without this check, cols - excludeLast would wrap around and the loop
	// below would write far past the end of every row.
	if(excludeLast > pData->cols())
		ThrowError("The -excludelast value exceeds the number of columns in the data.");

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			pRow[c] += dev * prng.normal();
	}
	pData->print(cout);
}
Beispiel #20
0
void curviness2(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GNormalize norm;
	GMatrix* pDataNormalized = norm.doit(*pData);
	Holder<GMatrix> hDataNormalized(pDataNormalized);
	hData.reset();
	pData = NULL;

	// Parse Options
	size_t maxEigs = 10;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	Holder<GMatrix> hControlData(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-maxeigs"))
			maxEigs = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t targetDims = std::min(maxEigs, pDataNormalized->cols());

	// Do linear PCA
	GNeuroPCA np1(targetDims, &rand);
	np1.setActivation(new GActivationIdentity());
	np1.computeEigVals();
	GMatrix* pResults1 = np1.doit(*pDataNormalized);
	Holder<GMatrix> hResults1(pResults1);
	double* pEigVals1 = np1.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals1[i] = sqrt(pEigVals1[i]) - sqrt(pEigVals1[i + 1]);
	size_t max1 = GVec::indexOfMax(pEigVals1, targetDims - 1, &rand);
	double v1 = (double)max1;
	if(max1 > 0 && max1 + 2 < targetDims)
		v1 += (pEigVals1[max1 - 1] - pEigVals1[max1 + 1]) / (2.0 * (pEigVals1[max1 - 1] + pEigVals1[max1 + 1] - 2.0 * pEigVals1[max1]));

	// Do non-linear PCA
	GNeuroPCA np2(targetDims, &rand);
	np1.setActivation(new GActivationLogistic());
	np2.computeEigVals();
	GMatrix* pResults2 = np2.doit(*pDataNormalized);
	Holder<GMatrix> hResults2(pResults2);
	double* pEigVals2 = np2.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals2[i] = sqrt(pEigVals2[i]) - sqrt(pEigVals2[i + 1]);
	size_t max2 = GVec::indexOfMax(pEigVals2, targetDims - 1, &rand);
	double v2 = (double)max2;
	if(max2 > 0 && max2 + 2 < targetDims)
		v2 += (pEigVals2[max2 - 1] - pEigVals2[max2 + 1]) / (2.0 * (pEigVals2[max2 - 1] + pEigVals2[max2 + 1] - 2.0 * pEigVals2[max2]));

	// Compute the difference in where the eigenvalues fall
	cout.precision(14);
	cout << (v1 - v2) << "\n";
}
Beispiel #21
0
// Transforms a dataset with GManifoldSculpting and prints the result to
// stdout.
//
// Arguments (in order): <dataset> <neighbor finder...> <target dims>
//                       [-seed <uint>] [-continue <hint dataset>]
//                       [-scalerate <double>]
// The hint dataset given with -continue must have exactly <target dims>
// attributes and as many rows as the input; it is handed over to the
// transform as preprocessed data. The squishing rate defaults to 0.999.
// Throws Ex for an unrecognized option or a mismatched hint dataset.
void ManifoldSculpting(GArgReader& args)
{
	// Positional parameters: dataset, neighbor finder, target dimensionality
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	size_t targetDims = args.pop_uint();

	// Everything that remains must be a recognized option
	const char* szHintFile = NULL;
	double squishRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-continue"))
		{
			szHintFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-scalerate"))
		{
			squishRate = args.pop_double();
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Optionally load the hint data and check that it fits the request
	GMatrix* pHint = NULL;
	Holder<GMatrix> hHint(NULL);
	if(szHintFile)
	{
		pHint = loadData(szHintFile);
		hHint.reset(pHint);
		if(pHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pHint->rows() != pInput->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Configure and run the transform, then print the result
	GManifoldSculpting transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(squishRate);
	if(pHint)
		transform.setPreprocessedData(hHint.release()); // the holder gives the hint data up here
	transform.setNeighborFinder(pFinder);
	GMatrix* pOutput = transform.doit(*pInput);
	Holder<GMatrix> hOutput(pOutput);
	pOutput->print(cout);
}
Beispiel #22
0
// Computes the singular value decomposition of a dataset. U and V are saved
// as ARFF files. If a sigma filename is given, the diagonal values are
// written into a zero-filled matrix and saved to it; otherwise they are
// printed to stdout.
//
// Arguments (in order): <dataset> [-ufilename <file>] [-sigmafilename <file>]
//                       [-vfilename <file>] [-maxiters <uint>]
// Defaults: u.arff, v.arff, no sigma file, 100 iterations.
// Raises an error (through ThrowError) for any unrecognized option.
void singularValueDecomposition(GArgReader& args)
{
	// Load the dataset
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// Everything that remains must be a recognized option
	string uPath = "u.arff";
	string sigmaPath;
	string vPath = "v.arff";
	int iterationCap = 100;
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
		{
			uPath = args.pop_string();
			continue;
		}
		if(args.if_pop("-sigmafilename"))
		{
			sigmaPath = args.pop_string();
			continue;
		}
		if(args.if_pop("-vfilename"))
		{
			vPath = args.pop_string();
			continue;
		}
		if(args.if_pop("-maxiters"))
		{
			iterationCap = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Decompose; the three outputs are held so they are released on exit
	GMatrix* pLeft;
	double* pSingular;
	GMatrix* pRight;
	pMatrix->singularValueDecomposition(&pLeft, &pSingular, &pRight, false, iterationCap);
	Holder<GMatrix> hLeft(pLeft);
	ArrayHolder<double> hSingular(pSingular);
	Holder<GMatrix> hRight(pRight);
	pLeft->saveArff(uPath.c_str());
	pRight->saveArff(vPath.c_str());

	if(sigmaPath.length() == 0)
	{
		// No sigma file requested: print the diagonal values instead
		GVec::print(cout, 14, pSingular, std::min(pLeft->rows(), pRight->rows()));
		cout << "\n";
	}
	else
	{
		// Place the diagonal values into an otherwise-zero matrix and save it
		GMatrix sigma(pLeft->rows(), pRight->rows());
		sigma.setAll(0.0);
		size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
		for(size_t d = 0; d < diagLen; d++)
			sigma.row(d)[d] = pSingular[d];
		sigma.saveArff(sigmaPath.c_str());
	}
}
Beispiel #23
0
// Replaces a portion of the elements of a dataset with "unknown" markers and
// prints the result to stdout. Elements are visited once each, column by
// column; exactly size_t(portion * rows * cols) of them are replaced,
// selected at random.
//
// Arguments (in order): <dataset> <portion> [-seed <uint>]
// Raises an error (through ThrowError) for any unrecognized option.
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	// Own the loaded matrix so it is released on every exit path (it was
	// previously never freed).
	std::unique_ptr<GMatrix> hData(pData);
	double portion = args.pop_double();

	// Parse the options (default seed mixes the process id with the current time)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Selection sampling: n counts the elements not yet visited and k the
	// replacements still to be made, so each element is dropped with
	// probability k / n.
	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		// A value count of 0 selects the real-valued marker; any other count
		// selects the discrete marker.
		double unknownValue = UNKNOWN_REAL_VALUE;
		if(pData->relation()->valueCount(i) != 0)
			unknownValue = UNKNOWN_DISCRETE_VALUE;

		for(size_t j = 0; j < pData->rows(); j++)
		{
			if(rand.next(n) < k)
			{
				pData->row(j)[i] = unknownValue;
				k--;
			}
			n--;
		}
	}
	pData->print(cout);
}
Beispiel #24
0
// Blends two embeddings of the same original dataset using
// GManifold::blendEmbeddings with a uniform ratio of 0.5 for every row, and
// prints the blended data to stdout.
//
// Arguments (in order): <original dataset> <neighbor finder...>
//                       <embedding A> <embedding B> [-seed <uint>]
// Throws Ex if the row counts differ from the original, if the two
// embeddings have different column counts, or for an unrecognized option.
void blendEmbeddings(GArgReader& args)
{
	// Positional parameters: original data, neighbor finder, two embeddings
	GMatrix* pOriginal = loadData(args.pop_string());
	Holder<GMatrix> hOriginal(pOriginal);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pOriginal, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	GMatrix* pEmbedA = loadData(args.pop_string());
	Holder<GMatrix> hEmbedA(pEmbedA);
	GMatrix* pEmbedB = loadData(args.pop_string());
	Holder<GMatrix> hEmbedB(pEmbedB);

	// Both embeddings must line up with the original and with each other
	bool rowsMatch = pEmbedA->rows() == pOriginal->rows() && pEmbedB->rows() == pOriginal->rows();
	if(!rowsMatch)
		throw Ex("mismatching number of rows");
	if(pEmbedA->cols() != pEmbedB->cols())
		throw Ex("mismatching number of cols");

	// Everything that remains must be a recognized option
	while(args.size() > 0)
	{
		if(!args.if_pop("-seed"))
			throw Ex("Invalid option: ", args.peek());
		prng.setSeed(args.pop_uint());
	}

	// Make sure the neighbor finder is a caching one, wrapping it if needed;
	// the holder gives up the original finder to the wrapper.
	if(!pFinder->isCached())
	{
		GNeighborFinderCacheWrapper* pWrapped = new GNeighborFinderCacheWrapper(hFinder.release(), true);
		hFinder.reset(pWrapped);
		pFinder = pWrapped;
	}
	GNeighborFinderCacheWrapper* pCaching = (GNeighborFinderCacheWrapper*)pFinder;
	pCaching->fillCache();
	size_t* pNeighbors = pCaching->cache();

	// Blend from a randomly chosen starting row with equal weight everywhere
	size_t seedRow = (size_t)prng.next(pEmbedA->rows());
	double* pWeights = new double[pEmbedA->rows()];
	ArrayHolder<double> hWeights(pWeights);
	GVec::setAll(pWeights, 0.5, pEmbedA->rows());
	GMatrix* pBlended = GManifold::blendEmbeddings(pEmbedA, pWeights, pEmbedB, pFinder->neighborCount(), pNeighbors, seedRow);
	Holder<GMatrix> hBlended(pBlended);
	pBlended->print(cout);
}
Beispiel #25
0
// Trains a GAttributeSelector on a dataset and prints the attribute rankings
// to stdout. Optionally also saves the data reduced to the requested number
// of features.
//
// Arguments: <dataset and switches, consumed by loadDataWithSwitches>
//            [-seed <uint>] [-out <target features> <filename>]
// Throws Ex for any unrecognized option.
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options (default seed mixes the process id with the current time)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid option: ", args.peek()); // previously mislabeled as a neighbor finder option
	}

	// Do the attribute selection (training is needed whether or not the
	// reduced data is saved)
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	as.train(data);
	if(outFilename.length() > 0)
	{
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		// Report success only after the file has actually been written
		pDataOut->saveArff(outFilename.c_str());
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
	}

	// Print the rankings, mapping each rank back to its original column index
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
		cout << originalIndices.at(as.ranks()[i]) << " " << pRel->attrName(as.ranks()[i]) << "\n";
}
Beispiel #26
0
// Discretizes the continuous attributes (those with a value count of 0) in a
// column range of a dataset into equal-width buckets, and prints the result
// to stdout.
//
// Arguments (in order): <dataset> [-buckets <uint>] [-colrange <first> <last>]
// Defaults: all columns; max(2, floor(sqrt(rows + 0.5))) buckets.
// Raises an error (through ThrowError) for an unrecognized option, a column
// range that is out of bounds or reversed, or a bucket count of zero.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	// nFirst is unsigned, so only the upper bound and the ordering need checking
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");
	if(nBuckets < 1)
		ThrowError("The number of buckets must be at least 1");

	// Discretize the continuous attributes in the specified range
	const double maxBucket = (double)(nBuckets - 1);
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);

			// Clamp while still in floating point. Converting a negative or
			// NaN double to size_t is undefined behavior, and a zero range
			// produces NaN here; both now map to bucket 0.
			double bucket = floor(((pPat[i] - min) * nBuckets) / range);
			if(!(bucket >= 0.0))
				bucket = 0.0;
			else if(bucket > maxBucket)
				bucket = maxBucket;
			pPat[i] = bucket;
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
Beispiel #27
0
// Runs GManifold::multiDimensionalScaling on a matrix of distances and
// prints the result to stdout. The random generator is always seeded with 0.
//
// Arguments (in order): <distance matrix> <target dims> [-squareddistances]
// Throws Ex for any unrecognized option.
void multiDimensionalScaling(GArgReader& args)
{
	GRand prng(0);
	GMatrix* pDistances = loadData(args.pop_string());
	// Own the loaded matrix so it is released on every exit path (it was
	// previously never freed).
	// NOTE(review): this assumes multiDimensionalScaling does not take
	// ownership of pDistances - confirm against GManifold.
	Holder<GMatrix> hDistances(pDistances);
	int targetDims = args.pop_uint();

	// Parse Options
	bool useSquaredDistances = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-squareddistances"))
			useSquaredDistances = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GMatrix* pResults = GManifold::multiDimensionalScaling(pDistances, targetDims, &prng, useSquaredDistances);
	Holder<GMatrix> hResults(pResults);
	pResults->print(cout);
}
Beispiel #28
0
// Shuffles the rows of a dataset and prints the result to stdout.
//
// Arguments (in order): <dataset> [-seed <uint>]
// Raises an error (through ThrowError) for any unrecognized option.
void Shuffle(GArgReader& args)
{
	// Load the dataset
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// Options (default seed mixes the process id with the current time)
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(!args.if_pop("-seed"))
			ThrowError("Invalid option: ", args.peek());
		rngSeed = args.pop_uint();
	}

	// Shuffle with the chosen seed, then print
	GRand prng(rngSeed);
	pMatrix->shuffle(prng);
	pMatrix->print(cout);
}
Beispiel #29
0
// Builds a transition dataset from an action dataset and a state dataset and
// prints it to stdout. Output row i holds action[i], state[i], and then
// either state[i+1] or, with -delta, state[i+1] - state[i]. The output has
// one row fewer than the inputs (and is empty when the inputs are empty).
//
// Arguments (in order): <actions> <states> [-delta]
// Raises an error (through ThrowError) if the two datasets have different
// row counts, or for any unrecognized option.
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output relation: action attributes followed by two copies of
	// the state dimensions
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);

	// Guard the subtraction: with no input rows, rows() - 1 would wrap around
	// to a huge unsigned value.
	size_t transitionCount = pActions->rows() > 0 ? pActions->rows() - 1 : 0;

	// A stack-allocated matrix is released automatically (the heap-allocated
	// one used previously was never freed).
	GMatrix transitions(pRel);
	transitions.newRows(transitionCount);
	for(size_t i = 0; i < transitionCount; i++)
	{
		double* pOut = transitions.row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	transitions.print(cout);
}
Beispiel #30
0
// Parses a delimited text file with GMatrix::parseCsv, names the resulting
// relation after the file, and prints the data to stdout.
//
// Arguments (in order): <file> [-tab] [-space] [-whitespace] [-semicolon]
//                       [-separator <string>] [-tolerant] [-columnnames]
// The separator defaults to ','; -whitespace selects '\0'; -separator uses
// the first character of its argument.
// Raises an error (through ThrowError) for any unrecognized option.
void Import(GArgReader& args)
{
	// Read the whole file into memory
	size_t fileLen;
	const char* szPath = args.pop_string();
	char* pContents = GFile::loadFile(szPath, &fileLen);
	ArrayHolder<char> hContents(pContents);

	// Everything that remains must be a recognized option
	char delimiter = ',';
	bool lenient = false;
	bool headerRow = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
		{
			delimiter = '\t';
			continue;
		}
		if(args.if_pop("-space"))
		{
			delimiter = ' ';
			continue;
		}
		if(args.if_pop("-whitespace"))
		{
			delimiter = '\0';
			continue;
		}
		if(args.if_pop("-semicolon"))
		{
			delimiter = ';';
			continue;
		}
		if(args.if_pop("-separator"))
		{
			delimiter = args.pop_string()[0];
			continue;
		}
		if(args.if_pop("-tolerant"))
		{
			lenient = true;
			continue;
		}
		if(args.if_pop("-columnnames"))
		{
			headerRow = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Parse the contents and label the relation with the source filename
	GMatrix* pParsed = GMatrix::parseCsv(pContents, fileLen, delimiter, headerRow, lenient);
	Holder<GMatrix> hParsed(pParsed);
	((GArffRelation*)pParsed->relation().get())->setName(szPath);

	pParsed->print(cout);
}