Ejemplo n.º 1
0
// Command handler: loads a dataset, optionally shuffles it, divides it into two
// matrices with GMatrix::splitBySize, and saves each part as an ARFF file.
//
// Positional arguments, in order: dataset filename, an unsigned row count,
// output filename for the loaded matrix after the split, output filename for
// the second matrix.
// Options: -shuffle (shuffle before splitting), -seed <n> (seed for the
// generator used by the shuffle; defaults to a value derived from pid and time).
// Raises an error through ThrowError when the row count exceeds the number of
// rows in the data, or when an unknown option is encountered.
void split(GArgReader& args)
{
	// Load the dataset and read the positional arguments
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int splitSize = (int)pData->rows() - args.pop_uint(); // total rows minus the requested count
	if(splitSize < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szFirstOut = args.pop_string();
	const char* szSecondOut = args.pop_string();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool shuffleFirst = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
		{
			shuffleFirst = true;
			continue;
		}
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle only when asked to
	GRand rng(seed);
	if(shuffleFirst)
		pData->shuffle(rng);

	// Split, then write both parts
	GMatrix other(pData->relation());
	pData->splitBySize(&other, splitSize);
	pData->saveArff(szFirstOut);
	other.saveArff(szSecondOut);
}
Ejemplo n.º 2
0
// Command handler: runs a dataset through a GNominalToCat transform and prints
// the transformed data to stdout.
//
// Positional argument: the dataset filename.
// Option: -maxvalues <n> sets the value handed to the GNominalToCat
// constructor (12 when omitted).
// Raises an error through ThrowError on an unrecognized option.
void nominalToCat(GArgReader& args)
{
	// Load the input
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Read the options
	int valueLimit = 12;
	while(args.size() > 0)
	{
		if(args.if_pop("-maxvalues"))
		{
			valueLimit = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Train the transform on the data, then apply it to the same data
	GNominalToCat converter(valueLimit);
	converter.train(*pInput);
	GMatrix* pConverted = converter.transformBatch(*pInput);
	Holder<GMatrix> hConverted(pConverted);

	// Emit the result
	pConverted->print(cout);
}
Ejemplo n.º 3
0
// Command handler: reduces a dataset with GIsomap and prints the result to stdout.
//
// Positional arguments: the dataset filename, whatever arguments
// instantiateNeighborFinder consumes, then the number of target dimensions.
// Options: -seed <n> reseeds the random number generator; -tolerant calls
// dropDisconnectedPoints() on the transform before reducing.
// Throws Ex on an unrecognized option.
void isomap(GArgReader& args)
{
	// Load the dataset and build the neighbor finder
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand rng(initialSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &rng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int dims = args.pop_uint();

	// Read the options
	bool dropDisconnected = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-tolerant"))
		{
			dropDisconnected = true;
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Configure the transform and reduce the data
	GIsomap reducer(pFinder->neighborCount(), dims, &rng);
	reducer.setNeighborFinder(pFinder);
	if(dropDisconnected)
		reducer.dropDisconnectedPoints();
	GMatrix* pReduced = reducer.reduce(*pInput);
	Holder<GMatrix> hReduced(pReduced);
	pReduced->print(cout);
}
Ejemplo n.º 4
0
// Command handler: transforms a dataset with GLLE and prints the result to stdout.
//
// Positional arguments: the dataset filename, whatever arguments
// instantiateNeighborFinder consumes, then the number of target dimensions.
// Option: -seed <n> reseeds the random number generator.
// Throws Ex on an unrecognized option.
void lle(GArgReader& args)
{
	// Load the dataset and build the neighbor finder
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand rng(initialSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &rng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int dims = args.pop_uint();

	// Read the options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rng.setSeed(args.pop_uint());
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Configure the transform and run it
	GLLE embedder(pFinder->neighborCount(), dims, &rng);
	embedder.setNeighborFinder(pFinder);
	GMatrix* pEmbedded = embedder.doit(*pInput);
	Holder<GMatrix> hEmbedded(pEmbedded);
	pEmbedded->print(cout);
}
Ejemplo n.º 5
0
// Command handler: writes one ARFF file per value of a nominal attribute.
//
// Positional arguments: the dataset filename and the index of the class attribute.
// Option: -dropclass deletes the class column from each part before saving.
//
// For every value index i in [0, valueCount(classAttr)) the rows are split off
// with GMatrix::splitByNominalValue and saved to "<base>_<value>.arff", where
// <base> is the input filename without directory or extension and <value> is
// the text produced by printAttrValue for i.
// Raises an error through ThrowError if the attribute index is out of range or
// an unrecognized option is found.
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	size_t classAttr = args.pop_uint();
	// Reject a bad index up front instead of passing it on to the relation
	if(classAttr >= pData->relation()->size())
		ThrowError("Index out of range");

	bool dropClass = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// The base name depends only on the input filename, so compute it once
	PathData pd;
	GFile::parsePath(filename, &pd);
	string baseName;
	baseName.assign(filename + pd.fileStart, pd.extStart - pd.fileStart);

	const size_t valueCount = pData->relation()->valueCount(classAttr);
	for(size_t i = 0; i < valueCount; i++)
	{
		// Pull out the rows for value i
		GMatrix tmp(pData->relation(), pData->heap());
		pData->splitByNominalValue(&tmp, classAttr, i);

		// Build "<base>_<value>.arff"
		std::ostringstream oss;
		oss << baseName << "_";
		pData->relation()->printAttrValue(oss, classAttr, (double)i);
		oss << ".arff";
		string s = oss.str();

		if(dropClass)
			tmp.deleteColumn(classAttr);
		tmp.saveArff(s.c_str());
	}
}
Ejemplo n.º 6
0
// Command handler: reduces a dataset with GBreadthFirstUnfolding and prints the
// result to stdout.
//
// Positional arguments: the dataset filename, whatever arguments
// instantiateNeighborFinder consumes, then the number of target dimensions.
// Options: -seed <n> sets the seed; -reps <n> sets the repetition count passed
// to the transform (1 when omitted).
// Throws Ex on an unrecognized option.
void breadthFirstUnfolding(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	int targetDims = args.pop_uint();

	// Parse Options
	size_t reps = 1;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			nSeed = args.pop_uint();
			// The neighbor finder was handed &prng, so reseed it too; otherwise
			// it would keep the pid/time-derived seed despite -seed.
			prng.setSeed(nSeed);
		}
		else if(args.if_pop("-reps"))
			reps = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GBreadthFirstUnfolding transform(reps, pNF->neighborCount(), targetDims);
	transform.rand().setSeed(nSeed);
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.reduce(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
Ejemplo n.º 7
0
// Command handler: prints the difference between where the eigenvalue spectrum
// "breaks" under linear PCA and under non-linear PCA of a normalized dataset.
//
// Positional argument: the dataset filename.
// Options: -seed <n> seeds the random number generator; -maxeigs <n> caps the
// number of components (10 when omitted; also capped by the column count).
// Throws Ex on an unrecognized option.
//
// For each GNeuroPCA run, consecutive differences of the square roots of the
// eigenvalues are computed, the index of the largest difference is located,
// and that index is refined with a three-point parabolic interpolation when it
// has a neighbor on both sides. The value printed is v1 - v2.
void curviness2(GArgReader& args)
{
	// Load and normalize; the raw data is not needed afterwards
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GNormalize norm;
	GMatrix* pDataNormalized = norm.doit(*pData);
	Holder<GMatrix> hDataNormalized(pDataNormalized);
	hData.reset();
	pData = NULL;

	// Parse Options
	size_t maxEigs = 10;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-maxeigs"))
			maxEigs = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t targetDims = std::min(maxEigs, pDataNormalized->cols());

	// Do linear PCA
	GNeuroPCA np1(targetDims, &rand);
	np1.setActivation(new GActivationIdentity());
	np1.computeEigVals();
	GMatrix* pResults1 = np1.doit(*pDataNormalized);
	Holder<GMatrix> hResults1(pResults1);
	double* pEigVals1 = np1.eigVals();
	// Overwrite each entry with the drop to the next one (in sqrt space)
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals1[i] = sqrt(pEigVals1[i]) - sqrt(pEigVals1[i + 1]);
	size_t max1 = GVec::indexOfMax(pEigVals1, targetDims - 1, &rand);
	double v1 = (double)max1;
	if(max1 > 0 && max1 + 2 < targetDims)
		v1 += (pEigVals1[max1 - 1] - pEigVals1[max1 + 1]) / (2.0 * (pEigVals1[max1 - 1] + pEigVals1[max1 + 1] - 2.0 * pEigVals1[max1]));

	// Do non-linear PCA
	GNeuroPCA np2(targetDims, &rand);
	// The logistic activation belongs on np2; setting it on np1 (as before)
	// left the second run without the intended activation.
	np2.setActivation(new GActivationLogistic());
	np2.computeEigVals();
	GMatrix* pResults2 = np2.doit(*pDataNormalized);
	Holder<GMatrix> hResults2(pResults2);
	double* pEigVals2 = np2.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals2[i] = sqrt(pEigVals2[i]) - sqrt(pEigVals2[i + 1]);
	size_t max2 = GVec::indexOfMax(pEigVals2, targetDims - 1, &rand);
	double v2 = (double)max2;
	if(max2 > 0 && max2 + 2 < targetDims)
		v2 += (pEigVals2[max2 - 1] - pEigVals2[max2 + 1]) / (2.0 * (pEigVals2[max2 - 1] + pEigVals2[max2 + 1] - 2.0 * pEigVals2[max2]));

	// Compute the difference in where the eigenvalues fall
	cout.precision(14);
	cout << (v1 - v2) << "\n";
}
Ejemplo n.º 8
0
// Command handler: loads a dataset, calls centerMeanAtOrigin() on it, and
// prints the result to stdout.
//
// Positional argument: the dataset filename. Anything after it is rejected
// through ThrowError.
void zeroMean(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// No further arguments are accepted
	const bool hasExtraArgs = args.size() > 0;
	if(hasExtraArgs)
		ThrowError("Superfluous arg: ", args.pop_string());

	pMatrix->centerMeanAtOrigin();
	pMatrix->print(cout);
}
Ejemplo n.º 9
0
// Command handler: loads a dataset, multiplies it by a scalar, and prints the
// result to stdout.
//
// Positional arguments: the dataset filename and the scalar. Anything after
// them is rejected through ThrowError.
void multiplyScalar(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);
	const double factor = args.pop_double();

	// No further arguments are accepted
	const bool hasExtraArgs = args.size() > 0;
	if(hasExtraArgs)
		ThrowError("Superfluous arg: ", args.pop_string());

	pMatrix->multiply(factor);
	pMatrix->print(cout);
}
Ejemplo n.º 10
0
// Loads the dataset named by the next argument into hOutput.
//
// args    - the next argument must be the filename. It may be followed by
//           "-input_type <type>" to override the type otherwise deduced from
//           the filename extension (no extension means "arff").
// hOutput - receives a newly allocated copy of the loaded matrix.
//
// Supported types (case-insensitive): "arff", "csv", and "dat" (parsed with
// GCSVParser using the '\0' separator). For csv/dat a per-column parsing
// report is written to stderr.
// Throws Ex if no filename is available or the type is not supported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	if(args.size() < 1)
		throw Ex("Expected the filename of a dataset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);

	// Determine the input type
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}

	// Now load the data
	GMatrix data;
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else
	{
		const bool isCsv = _stricmp(input_type, "csv") == 0;
		const bool isDat = !isCsv && _stricmp(input_type, "dat") == 0;
		if(!isCsv && !isDat)
		{
			// Report the type that was actually tested, which may have come
			// from -input_type rather than from the filename extension.
			throw Ex("Unsupported file format: ", input_type);
		}

		// csv and dat share the parser; only the separator differs
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}

	// Hand the caller a heap-allocated copy of the whole matrix
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
Ejemplo n.º 11
0
// Command handler: computes precision/recall data for a collaborative filter,
// prints the area under the curve as a "%" comment line, and then prints the
// result matrix to stdout.
//
// Leading options: -seed <n> seeds the random number generator; -ideal is
// forwarded to precisionRecall().
// Positional arguments: the dataset filename followed by whatever arguments
// InstantiateAlgorithm consumes. Anything left over is rejected.
// Raises errors through ThrowError.
void ROC(GArgReader& args)
{
	// Read the leading flags
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool useIdeal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-ideal"))
		{
			useIdeal = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Load the dataset
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Build the recommender; nothing may follow its arguments
	GRand rng(seed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(rng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Produce the curve; the area is measured before the columns are rearranged
	GMatrix* pCurve = pFilter->precisionRecall(*pRatings, useIdeal);
	Holder<GMatrix> hCurve(pCurve);
	double areaUnder = GCollaborativeFilter::areaUnderCurve(*pCurve);
	pCurve->deleteColumn(1); // the precision column is not needed
	pCurve->swapColumns(0, 1);
	cout << "% Area Under the Curve = " << areaUnder << "\n";
	pCurve->print(cout);
}
Ejemplo n.º 12
0
// Command handler: transforms a dataset with GManifoldSculpting and prints the
// result to stdout.
//
// Positional arguments: the dataset filename, whatever arguments
// instantiateNeighborFinder consumes, then the number of target dimensions.
// Options: -seed <n> reseeds the random number generator; -continue <file>
// names a dataset handed to setPreprocessedData(); -scalerate <r> sets the
// squishing rate (0.999 when omitted).
// Throws Ex on an unrecognized option, or when the -continue data does not
// have targetDims attributes and the same number of rows as the input.
void ManifoldSculpting(GArgReader& args)
{
	// Load the dataset and build the neighbor finder
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand rng(initialSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &rng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	size_t dims = args.pop_uint();

	// Read the options
	const char* szHintFile = NULL;
	double squishRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-continue"))
		{
			szHintFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-scalerate"))
		{
			squishRate = args.pop_double();
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Load and validate the optional hint data
	GMatrix* pHint = NULL;
	Holder<GMatrix> hHint(NULL);
	if(szHintFile != NULL)
	{
		pHint = loadData(szHintFile);
		hHint.reset(pHint);
		if(pHint->relation()->size() != dims)
			throw Ex("Wrong number of dims in the hint data");
		if(pHint->rows() != pInput->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Configure the transform and run it
	GManifoldSculpting sculptor(pFinder->neighborCount(), dims, &rng);
	sculptor.setSquishingRate(squishRate);
	if(pHint != NULL)
		sculptor.setPreprocessedData(hHint.release()); // the holder gives the hint matrix up here
	sculptor.setNeighborFinder(pFinder);
	GMatrix* pSculpted = sculptor.doit(*pInput);
	Holder<GMatrix> hSculpted(pSculpted);
	pSculpted->print(cout);
}
Ejemplo n.º 13
0
// Command handler: computes the singular value decomposition of a dataset and
// saves the U and V matrices as ARFF files.
//
// Positional argument: the dataset filename.
// Options: -ufilename <f> (default "u.arff"), -vfilename <f> (default
// "v.arff"), -sigmafilename <f>, -maxiters <n> (default 100).
// When -sigmafilename is given, the diagonal is written into a zero-filled
// matrix and saved to that file; otherwise the diagonal values are printed to
// stdout.
// Raises an error through ThrowError on an unrecognized option.
void singularValueDecomposition(GArgReader& args)
{
	// Load the input
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Read the options
	string uFile = "u.arff";
	string sigmaFile;
	string vFile = "v.arff";
	int iterLimit = 100;
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
		{
			uFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-sigmafilename"))
		{
			sigmaFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-vfilename"))
		{
			vFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-maxiters"))
		{
			iterLimit = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Decompose; the three outputs are owned by the holders from here on
	GMatrix* pU;
	double* pDiag;
	GMatrix* pV;
	pInput->singularValueDecomposition(&pU, &pDiag, &pV, false, iterLimit);
	Holder<GMatrix> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GMatrix> hV(pV);

	// U and V are always saved
	pU->saveArff(uFile.c_str());
	pV->saveArff(vFile.c_str());

	if(sigmaFile.length() == 0)
	{
		// No sigma file requested: print the diagonal values instead
		GVec::print(cout, 14, pDiag, std::min(pU->rows(), pV->rows()));
		cout << "\n";
	}
	else
	{
		// Place the diagonal values into an otherwise zero matrix and save it
		GMatrix sigma(pU->rows(), pV->rows());
		sigma.setAll(0.0);
		size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
		for(size_t k = 0; k < diagLen; k++)
			sigma.row(k)[k] = pDiag[k];
		sigma.saveArff(sigmaFile.c_str());
	}
}
Ejemplo n.º 14
0
// Command handler: cross-validates a collaborative filter on a dataset and
// prints "RMSE=..., MSE=..., MAE=..." to stdout.
//
// Leading options: -seed <n> seeds the model's random number generator;
// -folds <n> sets the number of folds (2 when omitted, and at least 2).
// Positional arguments: the dataset filename followed by whatever arguments
// InstantiateAlgorithm consumes. Anything left over is rejected.
// Throws Ex on invalid options, too few folds, a missing dataset, or
// superfluous arguments.
void GRecommenderLib::crossValidate(GArgReader& args)
{
	// Read the leading flags
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t foldCount = 2;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-folds"))
		{
			foldCount = args.pop_uint();
			continue;
		}
		throw Ex("Invalid crossvalidate option: ", args.peek());
	}
	if(foldCount < 2)
		throw Ex("There must be at least 2 folds.");

	// Load the dataset
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix ratings;
	loadData(ratings, args.pop_string());

	// Build the recommender; nothing may follow its arguments
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(seed);

	// Cross-validate; the return value is used as the MSE and the MAE comes
	// back through the out-parameter
	double mae;
	double mse = pFilter->crossValidate(ratings, foldCount, &mae);
	cout << "RMSE=" << sqrt(mse) << ", MSE=" << mse << ", MAE=" << mae << "\n";
}
Ejemplo n.º 15
0
// Command handler: merges one or more datasets side by side with
// GMatrix::mergeHoriz and prints the result to stdout.
//
// Positional arguments: one or more dataset filenames. Each later dataset must
// have the same number of rows as the matrix merged so far; otherwise an error
// is raised through ThrowError.
void mergeHoriz(GArgReader& args)
{
	GMatrix* pFirst = loadData(args.pop_string());
	Holder<GMatrix> hFirst(pFirst);

	// pResult starts out as the first dataset (owned by hFirst); every merged
	// matrix produced in the loop is handed to hResult.
	GMatrix* pResult = pFirst;
	Holder<GMatrix> hResult(NULL);
	for(; args.size() > 0; )
	{
		GMatrix* pNext = loadData(args.pop_string());
		Holder<GMatrix> hNext(pNext);
		if(pResult->rows() != pNext->rows())
			ThrowError("The datasets must have the same number of rows");
		GMatrix* pCombined = GMatrix::mergeHoriz(pResult, pNext);
		hResult.reset(pCombined);
		pResult = pCombined;
	}
	pResult->print(cout);
}
Ejemplo n.º 16
0
// Command handler: blends two embeddings of the same dataset with
// GManifold::blendEmbeddings, using a ratio of 0.5 for every row, and prints
// the blended data to stdout.
//
// Positional arguments: the original dataset filename, whatever arguments
// instantiateNeighborFinder consumes, then the filenames of embeddings A and B.
// Option: -seed <n> reseeds the random number generator.
// Throws Ex if the row counts of A or B differ from the original data, if A and
// B have different column counts, or on an unrecognized option.
void blendEmbeddings(GArgReader& args)
{
	// Load the original data, the neighbor finder, and both embeddings
	GMatrix* pOriginal = loadData(args.pop_string());
	Holder<GMatrix> hOriginal(pOriginal);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand rng(initialSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pOriginal, &rng, args);
	Holder<GNeighborFinder> hNF(pNF);
	GMatrix* pEmbedA = loadData(args.pop_string());
	Holder<GMatrix> hEmbedA(pEmbedA);
	GMatrix* pEmbedB = loadData(args.pop_string());
	Holder<GMatrix> hEmbedB(pEmbedB);
	if(pEmbedA->rows() != pOriginal->rows() || pEmbedB->rows() != pOriginal->rows())
		throw Ex("mismatching number of rows");
	if(pEmbedA->cols() != pEmbedB->cols())
		throw Ex("mismatching number of cols");

	// Read the options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			rng.setSeed(args.pop_uint());
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Obtain a cache wrapper: wrap the finder when it reports that it is not
	// cached, otherwise treat the finder itself as the wrapper
	GNeighborFinderCacheWrapper* pCached;
	if(pNF->isCached())
		pCached = (GNeighborFinderCacheWrapper*)pNF;
	else
	{
		pCached = new GNeighborFinderCacheWrapper(hNF.release(), true);
		hNF.reset(pCached);
		pNF = pCached;
	}
	pCached->fillCache();
	size_t* pNeighborTable = pCached->cache();

	// Blend with equal weight on every row, starting from a random row
	const size_t rowCount = pEmbedA->rows();
	size_t startPoint = (size_t)rng.next(rowCount);
	double* pRatios = new double[rowCount];
	ArrayHolder<double> hRatios(pRatios);
	GVec::setAll(pRatios, 0.5, rowCount);
	GMatrix* pBlended = GManifold::blendEmbeddings(pEmbedA, pRatios, pEmbedB, pNF->neighborCount(), pNeighborTable, startPoint);
	Holder<GMatrix> hBlended(pBlended);
	pBlended->print(cout);
}
Ejemplo n.º 17
0
// Command handler: replaces continuous attributes with bucket indices and
// prints the result to stdout.
//
// Positional argument: the dataset filename.
// Options: -buckets <n> sets the number of buckets (default:
// max(2, floor(sqrt(rows + 0.5)))); -colrange <first> <last> restricts the
// work to an inclusive range of columns (default: all columns).
// Raises an error through ThrowError on an unrecognized option, a bucket count
// of zero, or a column range that is out of bounds or inverted.
//
// Only attributes whose valueCount() is 0 are touched. Each value is mapped to
// floor((value - min) * nBuckets / range), clamped to [0, nBuckets - 1], and
// the attribute's value count is then set to nBuckets.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	// With zero attributes this wraps around and is rejected by the range check below
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	// nBuckets - 1 below would wrap around for a bucket count of zero
	if(nBuckets < 1)
		ThrowError("the number of buckets must be at least 1");
	// nFirst is unsigned, so only the upper bound and the ordering need checking
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	const double lastBucket = (double)(nBuckets - 1);
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			// Clamp while still in floating point: converting a negative or
			// non-finite double to size_t is not well defined. A column with
			// no spread (range of zero) goes entirely into bucket 0 instead
			// of dividing by zero.
			double bucket = 0.0;
			if(range > 0.0)
				bucket = std::max(0.0, std::min(lastBucket, floor(((pPat[i] - min) * nBuckets) / range)));
			pPat[i] = bucket;
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
Ejemplo n.º 18
0
// Command handler: runs GManifold::multiDimensionalScaling on a matrix loaded
// from a file and prints the result to stdout.
//
// Positional arguments: the filename of the distance matrix and the number of
// target dimensions.
// Option: -squareddistances is forwarded as the useSquaredDistances flag.
// Throws Ex on an unrecognized option.
// The random number generator is always seeded with 0.
void multiDimensionalScaling(GArgReader& args)
{
	GRand prng(0);
	GMatrix* pDistances = loadData(args.pop_string());
	// Own the loaded matrix so it is freed on every exit path, as the other
	// handlers do. NOTE(review): assumes GManifold::multiDimensionalScaling
	// does not take ownership of pDistances - confirm.
	Holder<GMatrix> hDistances(pDistances);
	int targetDims = args.pop_uint();

	// Parse Options
	bool useSquaredDistances = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-squareddistances"))
			useSquaredDistances = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GMatrix* pResults = GManifold::multiDimensionalScaling(pDistances, targetDims, &prng, useSquaredDistances);
	Holder<GMatrix> hResults(pResults);
	pResults->print(cout);
}
Ejemplo n.º 19
0
// Command handler: loads a dataset, shuffles it, and prints it to stdout.
//
// Positional argument: the dataset filename.
// Option: -seed <n> seeds the random number generator (defaults to a value
// derived from pid and time).
// Raises an error through ThrowError on an unrecognized option.
void Shuffle(GArgReader& args)
{
	// Load the input
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Read the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle, then emit
	GRand rng(seed);
	pInput->shuffle(rng);
	pInput->print(cout);
}
Ejemplo n.º 20
0
// Command handler: parses a delimited text file with GMatrix::parseCsv, names
// the resulting relation after the file, and prints the data to stdout.
//
// Positional argument: the filename.
// Options selecting the separator character: -tab ('\t'), -space (' '),
// -whitespace ('\0'), -semicolon (';'), -separator <s> (first character of s);
// the default is ','.
// Other options: -tolerant and -columnnames are forwarded to parseCsv.
// Raises an error through ThrowError on an unrecognized option.
void Import(GArgReader& args)
{
	// Read the whole file into memory
	size_t fileLen;
	const char* szFile = args.pop_string();
	char* pContents = GFile::loadFile(szFile, &fileLen);
	ArrayHolder<char> hContents(pContents);

	// Read the options
	char sep = ',';
	bool beTolerant = false;
	bool firstRowIsNames = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
		{
			sep = '\t';
			continue;
		}
		if(args.if_pop("-space"))
		{
			sep = ' ';
			continue;
		}
		if(args.if_pop("-whitespace"))
		{
			sep = '\0';
			continue;
		}
		if(args.if_pop("-semicolon"))
		{
			sep = ';';
			continue;
		}
		if(args.if_pop("-separator"))
		{
			sep = args.pop_string()[0];
			continue;
		}
		if(args.if_pop("-tolerant"))
		{
			beTolerant = true;
			continue;
		}
		if(args.if_pop("-columnnames"))
		{
			firstRowIsNames = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Parse the contents and name the relation after the file
	GMatrix* pParsed = GMatrix::parseCsv(pContents, fileLen, sep, firstRowIsNames, beTolerant);
	Holder<GMatrix> hParsed(pParsed);
	((GArffRelation*)pParsed->relation().get())->setName(szFile);

	// Emit the data
	pParsed->print(cout);
}
Ejemplo n.º 21
0
// Command handler: builds a transition dataset from an action sequence and a
// state sequence and prints it to stdout.
//
// Positional arguments: the actions dataset filename and the state dataset
// filename; both must have the same number of rows.
// Option: -delta stores (next state - current state) in place of the next state.
// Raises an error through ThrowError on mismatched row counts or an
// unrecognized option.
//
// Output row i (for i in [0, rows - 1)) is the concatenation of action row i,
// state row i, and state row i + 1 (or the difference, with -delta).
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	Holder<GMatrix> hTransition(pTransition); // previously never freed

	// n rows yield n - 1 transitions; avoid the unsigned wrap-around on empty input
	const size_t rowCount = pActions->rows();
	const size_t transitionCount = rowCount > 0 ? rowCount - 1 : 0;
	pTransition->newRows(transitionCount);
	for(size_t i = 0; i < transitionCount; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
Ejemplo n.º 22
0
// Command handler: prints every row of a dataset to stdout through the
// relation's printRow(), using a configurable separator.
//
// Positional argument: the dataset filename.
// Options: -tab (tab separator), -space (space separator); the default is ",".
// Raises an error through ThrowError on an unrecognized option.
void Export(GArgReader& args)
{
	// Load the input
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Read the options
	const char* szSep = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
		{
			szSep = "	";
			continue;
		}
		if(args.if_pop("-space"))
		{
			szSep = " ";
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Emit one row at a time
	for(size_t r = 0; r < pInput->rows(); r++)
	{
		pInput->relation()->printRow(cout, pInput->row(r), szSep);
	}
}
Ejemplo n.º 23
0
// Command handler: divides a dataset into the training and test portions of
// one cross-validation fold and saves each as an ARFF file.
//
// Positional arguments: the dataset filename, the zero-based fold index, and
// the total number of folds.
// Option: -out <train> <test> sets the output filenames (defaults:
// "train.arff" and "test.arff").
// Raises an error through ThrowError if the fold index is not less than the
// number of folds, or on an unrecognized option.
//
// Rows [rows*fold/folds, rows*(fold+1)/folds) go to the test set; every other
// row goes to the training set, in its original order.
void splitFold(GArgReader& args)
{
	// Load the input and read the fold description
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	size_t foldIndex = args.pop_uint();
	size_t foldCount = args.pop_uint();
	if(foldIndex >= foldCount)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Read the options
	string trainFile = "train.arff";
	string testFile = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			trainFile = args.pop_string();
			testFile = args.pop_string();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Boundaries of the test range
	GMatrix trainPart(pInput->relation());
	GMatrix testPart(pInput->relation());
	const size_t testBegin = pInput->rows() * foldIndex / foldCount;
	const size_t testEnd = pInput->rows() * (foldIndex + 1) / foldCount;

	// Rows before the test range
	for(size_t r = 0; r < testBegin; r++)
		trainPart.copyRow(pInput->row(r));
	// Rows inside the test range
	for(size_t r = testBegin; r < testEnd; r++)
		testPart.copyRow(pInput->row(r));
	// Rows after the test range
	for(size_t r = testEnd; r < pInput->rows(); r++)
		trainPart.copyRow(pInput->row(r));

	trainPart.saveArff(trainFile.c_str());
	testPart.saveArff(testFile.c_str());
}
Ejemplo n.º 24
0
// Command handler: runs a dataset through a GNormalize transform and prints
// the transformed data to stdout.
//
// Positional argument: the dataset filename.
// Option: -range <min> <max> sets the two values passed to the GNormalize
// constructor (0.0 and 1.0 when omitted).
// Raises an error through ThrowError on an unrecognized option.
void normalize(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Read the options
	double lo = 0.0;
	double hi = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-range"))
		{
			lo = args.pop_double();
			hi = args.pop_double();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Train the transform on the data, then apply it to the same data
	GNormalize normalizer(lo, hi);
	normalizer.train(*pInput);
	GMatrix* pNormalized = normalizer.transformBatch(*pInput);
	Holder<GMatrix> hNormalized(pNormalized);
	pNormalized->print(cout);
}
Ejemplo n.º 25
0
// Command handler: multiplies two matrices with GMatrix::multiply and prints
// the product to stdout.
//
// Positional arguments: the filenames of matrices A and B.
// Options: -transposea and -transposeb are forwarded as the transpose flags
// for A and B respectively.
// Raises an error through ThrowError on an unrecognized option.
void multiplyMatrices(GArgReader& args)
{
	GMatrix* pLeft = loadData(args.pop_string());
	Holder<GMatrix> hLeft(pLeft);
	GMatrix* pRight = loadData(args.pop_string());
	Holder<GMatrix> hRight(pRight);

	// Read the options
	bool flipLeft = false;
	bool flipRight = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-transposea"))
		{
			flipLeft = true;
			continue;
		}
		if(args.if_pop("-transposeb"))
		{
			flipRight = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	GMatrix* pProduct = GMatrix::multiply(*pLeft, *pRight, flipLeft, flipRight);
	Holder<GMatrix> hProduct(pProduct);
	pProduct->print(cout);
}
Ejemplo n.º 26
0
// Command handler: sorts the rows of a dataset by one attribute and prints the
// result to stdout.
//
// Positional arguments: the dataset filename and the attribute index.
// Option: -descending reverses the row order after sorting.
// Raises an error through ThrowError if the index is not less than the number
// of attributes, or on an unrecognized option.
void SortByAttribute(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Validate the attribute index
	size_t sortAttr = args.pop_uint();
	if(sortAttr >= pInput->relation()->size())
		ThrowError("Index out of range");

	// Read the options
	bool reverseOrder = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-descending"))
		{
			reverseOrder = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Sort, optionally reverse, and emit
	pInput->sort(sortAttr);
	if(reverseOrder)
		pInput->reverseRows();
	pInput->print(cout);
}
Ejemplo n.º 27
0
// Command handler: loads a trained model from a JSON file, asks it to
// extrapolate, and prints the predictions to stdout.
//
// Positional argument: the model filename.
// Options (read while the next argument is a flag): -start <x> (default 1.0),
// -length <x> (default 1.0), -step <x> (default 0.0002), -features <dataset
// arguments for LoadData>, -outputFeatures.
// When -features is given, extrapolate() is called with the loaded matrix;
// otherwise it is called with start, length, step and the outputFeatures flag.
// Throws Ex when no model is specified or on an unrecognized option.
//
// NOTE(review): outputFeatures already defaults to true, so -outputFeatures
// currently changes nothing - confirm which default is intended.
// NOTE(review): the loaded learner is cast to GNeuralDecomposition without a
// type check - confirm callers only pass such models.
void Extrapolate(GArgReader &args)
{
	// Load the model
	if(args.size() < 1)
		throw Ex("Model not specified.");
	GDom doc;
	doc.loadJson(args.pop_string());
	GLearnerLoader loader(true);
	GSupervisedLearner *pLearner = loader.loadLearner(doc.root());
	std::unique_ptr<GSupervisedLearner> hLearner(pLearner);

	// Defaults for the options
	double start = 1.0;
	double length = 1.0;
	double step = 0.0002;
	bool haveFeatures = false;
	bool outputFeatures = true;

	GNeuralDecomposition *pDecomp = (GNeuralDecomposition *) pLearner;
	std::unique_ptr<GMatrix> hFeatures;

	// Read the options
	while(args.next_is_flag())
	{
		if(args.if_pop("-start"))
		{
			start = args.pop_double();
			continue;
		}
		if(args.if_pop("-length"))
		{
			length = args.pop_double();
			continue;
		}
		if(args.if_pop("-step"))
		{
			step = args.pop_double();
			continue;
		}
		if(args.if_pop("-features"))
		{
			LoadData(args, hFeatures);
			haveFeatures = true;
			continue;
		}
		if(args.if_pop("-outputFeatures"))
		{
			outputFeatures = true;
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Extrapolate, either over a generated range or over the supplied features
	GMatrix *pPredictions;
	if(!haveFeatures)
		pPredictions = pDecomp->extrapolate(start, length, step, outputFeatures);
	else
		pPredictions = pDecomp->extrapolate(*hFeatures);
	std::unique_ptr<GMatrix> hPredictions(pPredictions);

	// Emit the predictions
	pPredictions->print(cout);
}
Ejemplo n.º 28
0
// Command handler: prints summary statistics and significance tests comparing
// two attributes of a dataset.
//
// Positional arguments: the dataset filename and the two attribute indices.
// Option: -tol <x> sets the tolerance within which two values are counted as
// equal (default 0.001); it is also passed to the Wilcoxon test.
// Raises an error through ThrowError if an attribute index is out of range or
// an option is not recognized.
//
// Output, in order: medians, means, standard deviations and less/same/greater
// counts; a paired T-test; a paired T-test with the final pairedTTest flag set
// to true; and a Wilcoxon signed ranks test with P-values.
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();
	// Both indices are used to index rows directly, so validate them first.
	// The cast also rejects values that became negative in the int conversion.
	if((size_t)attr1 >= pData->cols() || (size_t)attr2 >= pData->cols())
		ThrowError("Index out of range");

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";

		// Count rows where attr1 is below, within tolerance of, or above attr2
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		// The header needs its own line break, or the next line runs into it
		cout << "\n### Wilcoxon Signed Ranks Test\n";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";

		// NOTE(review): p_min is already halved here and halved again for the
		// one-tailed value below - confirm against what wilcoxonPValue returns.
		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less than 0.05 is considered to be significant.\n";
/*
			double p_sum = GMath::wilcoxonPValue(num, w_sum);
			cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
*/
	}
}
Ejemplo n.º 29
0
// Command handler: fills in the unknown values of an ARFF dataset using the
// predictions of a collaborative filter and prints the completed data to stdout.
//
// Leading option: -seed <n> seeds the random number generator.
// Positional arguments: the ARFF filename followed by whatever arguments
// InstantiateAlgorithm consumes. Anything left over is rejected.
// Raises errors through ThrowError.
//
// Steps: the data is passed through a GNormalize + GNominalToCat chain (with
// unknowns preserved), every known element is written as a (row, column, value)
// triple, the filter is trained on those triples, each unknown element is
// replaced by the filter's prediction, and the chain is reversed so the output
// uses the original relation.
void fillMissingValues(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	sp_relation pOrigRel = pDataOrig->relation();
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	// hDataOrig keeps ownership and frees the original data when this function
	// returns. Calling release() here and discarding the result, as was done
	// before, gave up ownership without deleting, so the matrix was leaked.

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = i;
				pVec[1] = j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter. hMatrix likewise stays the owner of the
	// triples and frees them at scope exit (it used to be released and leaked).
	pModel->train(*pMatrix);

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = filter.untransformBatch(*pData);
	Holder<GMatrix> hOut(pOut); // previously never freed
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
Ejemplo n.º 30
0
// Command handler: measures the mean squared error between two datasets and
// prints it to stdout.
//
// Positional arguments: the filenames of the two datasets, which must have the
// same number of attributes and rows.
// Options: -fit runs a GHillClimber search over a FitDataCritic before
// reporting (progress is written to stderr); -sum reports a single value
// summed over the attributes instead of one value per attribute.
// Raises an error through ThrowError on mismatched sizes or an unrecognized
// option.
void MeasureMeanSquaredError(GArgReader& args)
{
	// Load the first file
	GMatrix* pData1 = loadData(args.pop_string());
	Holder<GMatrix> hData1(pData1);

	// Load the second file
	GMatrix* pData2 = loadData(args.pop_string());
	Holder<GMatrix> hData2(pData2);

	// check sizes
	if(pData1->relation()->size() != pData2->relation()->size())
		ThrowError("The datasets must have the same number of dims");
	if(pData1->rows() != pData2->rows())
		ThrowError("The datasets must have the same size");

	// Parse Options
	bool fit = false;
	bool sumOverAttributes = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-fit"))
			fit = true;
		else if(args.if_pop("-sum"))
			sumOverAttributes = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	size_t dims = pData1->relation()->size();
	if(fit)
	{
		FitDataCritic critic(pData1, pData2, dims);
		GHillClimber search(&critic);

		double dPrevError;
		double dError = search.iterate();
		cerr.precision(14);
		cerr << dError << "\n";
		cerr.flush();
		while(true)
		{
			// Run 30 iterations per round and report the error after the last one
			dPrevError = dError;
			for(int i = 1; i < 30; i++)
				search.iterate();
			dError = search.iterate();
			cerr << dError << "\n";
			cerr.flush();

			// Stop once the relative improvement is negligible. The test is
			// written in negated form so that a NaN ratio (for example 0/0
			// once the error reaches exactly zero) also ends the loop instead
			// of spinning forever.
			if(!((dPrevError - dError) / dPrevError >= 1e-10))
				break;
		}
		critic.ShowResults(search.currentVector(), sumOverAttributes);
	}
	else
	{
		// Compute mean squared error
		GTEMPBUF(double, results, dims);
		ComputeMeanSquaredError(pData1, pData2, dims, results);
		cout.precision(14);
		if(sumOverAttributes)
			cout << GVec::sumElements(results, dims);
		else
			GVec::print(cout, 14, results, dims);
	}
	cout << "\n";
}