Ejemplo n.º 1
0
// Loads a dataset, optionally shuffles it, and writes it out as two ARFF files.
// Arguments (in order): input filename, a row count, first output filename,
// second output filename, then any of the options -shuffle and -seed <n>.
void split(GArgReader& args)
{
	// Load the dataset; the holder deletes it when this function exits
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Total rows minus the user-supplied count; this value is handed to splitBySize
	int splitCount = (int)pData->rows() - args.pop_uint();
	if(splitCount < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szFirstOut = args.pop_string();
	const char* szSecondOut = args.pop_string();

	// Parse the options. The default seed varies from run to run.
	bool shuffleFirst = false;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
		{
			shuffleFirst = true;
			continue;
		}
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle only when requested
	GRand rng(seed);
	if(shouldShuffleGuard(shuffleFirst))
		pData->shuffle(rng);

	// Split and save both parts
	GMatrix remainder(pData->relation());
	pData->splitBySize(&remainder, splitCount);
	pData->saveArff(szFirstOut);
	remainder.saveArff(szSecondOut);
}
Ejemplo n.º 2
0
// Prepends a continuous "index" column to a dataset and prints the result.
// Arguments: input filename, then optionally -start <value> and/or
// -increment <value> (defaults 0 and 1).
void AddIndexAttribute(GArgReader& args)
{
	// Read the filename and the options
	const char* szInput = args.pop_string();
	double first = 0.0;
	double stride = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-start"))
		{
			first = args.pop_double();
		}
		else if(args.if_pop("-increment"))
		{
			stride = args.pop_double();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Load the data
	GMatrix* pData = loadData(szInput);
	Holder<GMatrix> hData(pData);

	// Build a one-column matrix holding the index values
	GArffRelation* pIndexRelation = new GArffRelation();
	pIndexRelation->addAttribute("index", 0, NULL);
	sp_relation pIndexRel = pIndexRelation;
	GMatrix indexes(pIndexRel);
	indexes.newRows(pData->rows());
	size_t r = 0;
	while(r < pData->rows())
	{
		indexes.row(r)[0] = first + r * stride;
		r++;
	}

	// Put the index column in front of the original columns and print
	GMatrix* pUnified = GMatrix::mergeHoriz(&indexes, pData);
	Holder<GMatrix> hUnified(pUnified);
	pUnified->print(cout);
}
Ejemplo n.º 3
0
// Prints the average distance to the closest neighbor and the average distance
// to all k neighbors, over every row of a dataset.
// Arguments: input filename, neighbor count k.
void neighbors(GArgReader& args)
{
	// Load the data and the neighbor count
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int neighborCount = args.pop_uint();

	// Query the kd-tree for every row
	GKdTree neighborFinder(pData, neighborCount, NULL, true);
	GTEMPBUF(size_t, hood, neighborCount);
	GTEMPBUF(double, dists, neighborCount);
	double closestTotal = 0;
	double allTotal = 0;
	size_t row = 0;
	while(row < pData->rows())
	{
		neighborFinder.neighbors(hood, dists, row);
		neighborFinder.sortNeighbors(hood, dists);

		// The square root is taken of each reported value before it is summed
		// (presumably the finder reports squared distances -- TODO confirm)
		closestTotal += sqrt(dists[0]);
		int n = 0;
		while(n < neighborCount)
		{
			allTotal += sqrt(dists[n]);
			n++;
		}
		row++;
	}

	// Report the averages
	cout.precision(14);
	cout << "average closest neighbor distance = " << (closestTotal / pData->rows()) << "\n";
	cout << "average neighbor distance = " << (allTotal / (pData->rows() * neighborCount)) << "\n";
}
Ejemplo n.º 4
0
// Deletes every row that contains at least one unknown value, then prints
// what is left. Argument: input filename.
void DropMissingValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();

	// Visit the rows from last to first, so rows not yet visited keep a valid index
	for(size_t i = pData->rows(); i-- > 0; )
	{
		double* pPat = pData->row(i);

		// Find the first column holding an unknown value (j == dims if there is none).
		// A value count of 0 marks a continuous attribute.
		size_t j = 0;
		while(j < dims)
		{
			const bool continuous = (pRelation->valueCount(j) == 0);
			const bool missing = continuous ? (pPat[j] == UNKNOWN_REAL_VALUE) : (pPat[j] == UNKNOWN_DISCRETE_VALUE);
			if(missing)
				break;
			j++;
		}
		if(j < dims)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
Ejemplo n.º 5
0
// virtual
// Trains on the given data. If there are more than m_maxSamples rows, rows are
// discarded at random until only m_maxSamples remain, and training is done on
// that subset. Throws Ex if the features or the labels are not all continuous.
void GGaussianProcess::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Small enough: train directly on the caller's matrices
	if(features.rows() <= m_maxSamples)
	{
		trainInnerInner(features, labels);
		return;
	}

	// Build temporary matrices that reference the caller's rows.
	// NOTE(review): the GReleaseDataHolder objects presumably release (rather than
	// delete) the remaining rows on scope exit, so the caller keeps ownership -- confirm.
	GMatrix f(features.relation().clone());
	GReleaseDataHolder hF(&f);
	GMatrix l(labels.relation().clone());
	GReleaseDataHolder hL(&l);
	for(size_t i = 0; i < features.rows(); i++)
	{
		// The casts strip const so the rows can be handed to takeRow
		f.takeRow((GVec*)&features[i]);
		l.takeRow((GVec*)&labels[i]);
	}

	// Drop randomly chosen rows, using the same index in both matrices so that
	// features and labels stay paired
	while(f.rows() > m_maxSamples)
	{
		size_t i = (size_t)m_rand.next(f.rows());
		f.releaseRow(i);
		l.releaseRow(i);
	}
	trainInnerInner(f, l);
}
Ejemplo n.º 6
0
/***********************************************************************//**
 * @brief GMatrix to GSymMatrix storage class convertor
 *
 * @param[in] matrix General matrix (GMatrix).
 *
 * @exception GException::matrix_not_symmetric
 *            Matrix is not square, or an element differs from its
 *            transposed counterpart.
 *
 * Converts a general matrix into the symmetric storage class. If the input
 * matrix is not symmetric, an exception is thrown. A matrix with a different
 * number of rows and columns is never symmetric, so it is rejected before
 * any element is read.
 ***************************************************************************/
GSymMatrix::GSymMatrix(const GMatrix& matrix)
{
    // Initialise class members for clean destruction
    init_members();

    // A non-square matrix cannot be symmetric. Checking this first also keeps
    // the transposed access matrix(col,row) below inside the matrix bounds.
    if (matrix.rows() != matrix.cols()) {
        throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                               matrix.rows(),
                                               matrix.cols());
    }

    // Allocate matrix memory
    alloc_members(matrix.rows(), matrix.cols());

    // Fill matrix from the lower triangle, verifying that each element
    // equals its mirror image in the upper triangle
    for (int col = 0; col < matrix.cols(); ++col) {
        for (int row = col; row < matrix.rows(); ++row) {
            const double value_ll = matrix(row,col);
            const double value_ur = matrix(col,row);
            if (value_ll != value_ur) {
                throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                                       matrix.rows(),
                                                       matrix.cols());
            }
            (*this)(row, col) = value_ll;
        }
    }

    // Return
    return;
}
Ejemplo n.º 7
0
// Prints a matrix with one row per lag (1 up to min(256, rows/2)). The first
// column is the lag; each following column is, for one attribute, the mean
// product of centered values that are that many rows apart.
// Argument: input filename.
void autoCorrelation(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t maxLag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();

	// Column means, used to center the values
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);

	GMatrix ac(0, dims + 1);
	for(size_t lag = 1; lag <= maxLag; lag++)
	{
		double* pOut = ac.newRow();
		pOut[0] = (double)lag;
		for(size_t d = 0; d < dims; d++)
		{
			// Sum over every pair of rows separated by exactly 'lag'
			double sum = 0;
			size_t pairs = 0;
			while(pairs + lag < pData->rows())
			{
				double* pA = pData->row(pairs);
				double* pB = pData->row(pairs + lag);
				sum += (pA[d] - mean[d]) * (pB[d] - mean[d]);
				pairs++;
			}
			pOut[d + 1] = sum / pairs;
		}
	}
	ac.print(cout);
}
Ejemplo n.º 8
0
// Writes an SVG chart to 'filename' showing the network's predictions as a
// curve, the training points, and the test points. Only column 0 of each
// matrix is plotted.
void plot_it(const char* filename, GNeuralNet& nn, GMatrix& trainFeat, GMatrix& trainLab, GMatrix& testFeat, GMatrix& testLab)
{
	// The horizontal range runs from the first training feature to the last test feature
	GSVG svg(1000, 500);
	double xmin = trainFeat[0][0];
	double xmax = testFeat[testFeat.rows() - 1][0];
	double ymin = std::min(trainLab.columnMin(0), testLab.columnMin(0));
	double ymax = std::max(trainLab.columnMax(0), testLab.columnMax(0));
	svg.newChart(xmin, ymin, xmax, ymax);
	svg.horizMarks(20);
	svg.vertMarks(20);

	// Draw the prediction curve as 500 short segments
	double step = (xmax - xmin) / 500.0;
	double lastX = xmin;
	double lastY = 0.0;
	GVec x(1);
	GVec y(1);
	x[0] = lastX;
	while(x[0] < xmax)
	{
		nn.predict(x, y);

		// The first sample has no predecessor to connect to
		if(lastX != x[0])
			svg.line(lastX, lastY, x[0], y[0], 0.3);
		lastX = x[0];
		lastY = y[0];
		x[0] += step;
	}

	// Plot the training points, then the test points
	for(size_t r = 0; r < trainLab.rows(); r++)
		svg.dot(trainFeat[r][0], trainLab[r][0], 0.4, 0xff000080);
	for(size_t r = 0; r < testLab.rows(); r++)
		svg.dot(testFeat[r][0], testLab[r][0], 0.4, 0xff800000);

	// Save the chart
	std::ofstream ofs(filename);
	svg.print(ofs);
}
Ejemplo n.º 9
0
/// Computes the belief vector anticipated after executing every step of the
/// plan in order, starting from the given beliefs. If the plan has no rows,
/// outFinalBeliefs is left untouched.
void TransitionModel::getFinalBeliefs(const GVec& beliefs, const GMatrix& plan, GVec& outFinalBeliefs)
{
	if(plan.rows() == 0)
		return;

	// The first step starts from the caller's beliefs...
	anticipateNextBeliefs(beliefs, plan[0], outFinalBeliefs);

	// ...and every later step starts from the previous step's result
	size_t step = 1;
	while(step < plan.rows())
	{
		anticipateNextBeliefs(outFinalBeliefs, plan[step], outFinalBeliefs);
		step++;
	}
}
Ejemplo n.º 10
0
// Removes rows from the end of a dataset until no more than the requested
// number remain, then prints it. Arguments: input filename, new row count.
void dropRows(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t newSize = args.pop_uint();

	// Delete the last row repeatedly, re-reading the row count each time
	for(size_t count = pData->rows(); count > newSize; count = pData->rows())
		pData->deleteRow(count - 1);
	pData->print(cout);
}
Ejemplo n.º 11
0
// Prints the sum, mean, and root-mean of the squared difference between two
// datasets. The mean is taken over the number of rows in the first dataset.
// Arguments: first filename, second filename.
void squaredDistance(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	GMatrix* pB = loadData(args.pop_string());
	Holder<GMatrix> hB(pB);

	const double sse = pA->sumSquaredDifference(*pB, false);
	const double mse = sse / pA->rows();
	cout << "Sum squared distance: " << sse << "\n";
	cout << "Mean squared distance: " << mse << "\n";
	cout << "Root mean squared distance: " << sqrt(mse) << "\n";
}
Ejemplo n.º 12
0
void GNaiveBayes_testMath()
{
	const char* trainFile =
	"@RELATION test\n"
	"@ATTRIBUTE a {t,f}\n"
	"@ATTRIBUTE b {r,g,b}\n"
	"@ATTRIBUTE c {y,n}\n"
	"@DATA\n"
	"t,r,y\n"
	"f,r,n\n"
	"t,g,y\n"
	"f,g,y\n"
	"f,g,n\n"
	"t,r,n\n"
	"t,r,y\n"
	"t,b,y\n"
	"f,r,y\n"
	"f,g,n\n"
	"f,b,y\n"
	"t,r,n\n";
	GMatrix train;
	train.parseArff(trainFile, strlen(trainFile));
	GMatrix* pFeatures = train.cloneSub(0, 0, train.rows(), 2);
	std::unique_ptr<GMatrix> hFeatures(pFeatures);
	GMatrix* pLabels = train.cloneSub(0, 2, train.rows(), 1);
	std::unique_ptr<GMatrix> hLabels(pLabels);
	GNaiveBayes nb;
	nb.setEquivalentSampleSize(0.0);
	nb.train(*pFeatures, *pLabels);
	GPrediction out;
	GVec pat(2);
	pat[0] = 0; pat[1] = 0;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*3.0/7.0, 5.0/12.0, 2.0/5.0*3.0/5.0, &out);
	pat[0] = 0; pat[1] = 1;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*2.0/7.0, 5.0/12.0, 2.0/5.0*2.0/5.0, &out);
	pat[0] = 0; pat[1] = 2;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*2.0/7.0, 5.0/12.0, 2.0/5.0*0.0/5.0, &out);
	pat[0] = 1; pat[1] = 0;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*3.0/7.0, 5.0/12.0, 3.0/5.0*3.0/5.0, &out);
	pat[0] = 1; pat[1] = 1;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*2.0/7.0, 5.0/12.0, 3.0/5.0*2.0/5.0, &out);
	pat[0] = 1; pat[1] = 2;
	nb.predictDistribution(pat, &out);
	GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*2.0/7.0, 5.0/12.0, 3.0/5.0*0.0/5.0, &out);
}
Ejemplo n.º 13
0
// Runs the GManifoldSculpting transform on a dataset and prints the result.
// Arguments (in order): input filename, the neighbor-finder arguments consumed
// by instantiateNeighborFinder, the target dimension count, then any of the
// options -seed <n>, -continue <filename>, -scalerate <value>.
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	// Default seed varies per run; -seed below re-seeds this same generator
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	// NOTE: the neighbor finder is created before the options are parsed, so any
	// use it makes of prng at construction time happens before -seed is applied
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	size_t targetDims = args.pop_uint();

	// Parse Options
	const char* szPreprocessedData = NULL;
	double scaleRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-continue"))
			szPreprocessedData = args.pop_string();
		else if(args.if_pop("-scalerate"))
			scaleRate = args.pop_double();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data
	GMatrix* pDataHint = NULL;
	Holder<GMatrix> hDataHint(NULL);
	if(szPreprocessedData)
	{
		pDataHint = loadData(szPreprocessedData);
		hDataHint.reset(pDataHint);
		// The hint must match the target dimensionality and the row count of the input
		if(pDataHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pDataHint->rows() != pData->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data
	GManifoldSculpting transform(pNF->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(scaleRate);
	// release() gives up the holder's ownership of the hint data when it is passed on
	// (presumably the transform then owns it -- TODO confirm)
	if(pDataHint)
		transform.setPreprocessedData(hDataHint.release());
	// NOTE(review): hNF still owns pNF here; this assumes setNeighborFinder does
	// not take ownership -- confirm
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
Ejemplo n.º 14
0
// Computes the singular value decomposition of a dataset. U and V are always
// saved to ARFF files. The singular values are either saved as a diagonal
// matrix (when -sigmafilename is given) or printed to stdout.
// Arguments: input filename, then any of -ufilename <f>, -sigmafilename <f>,
// -vfilename <f>, -maxiters <n>.
void singularValueDecomposition(GArgReader& args)
{
	// Load the data
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Defaults, then options
	string uOut = "u.arff";
	string sigmaOut;
	string vOut = "v.arff";
	int iterLimit = 100;
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
		{
			uOut = args.pop_string();
		}
		else if(args.if_pop("-sigmafilename"))
		{
			sigmaOut = args.pop_string();
		}
		else if(args.if_pop("-vfilename"))
		{
			vOut = args.pop_string();
		}
		else if(args.if_pop("-maxiters"))
		{
			iterLimit = args.pop_uint();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Decompose. The three outputs are allocated by the call and owned here.
	GMatrix* pU;
	double* pDiag;
	GMatrix* pV;
	pData->singularValueDecomposition(&pU, &pDiag, &pV, false, iterLimit);
	Holder<GMatrix> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GMatrix> hV(pV);
	pU->saveArff(uOut.c_str());
	pV->saveArff(vOut.c_str());

	// Without a sigma filename, just print the singular values
	if(sigmaOut.length() == 0)
	{
		GVec::print(cout, 14, pDiag, std::min(pU->rows(), pV->rows()));
		cout << "\n";
		return;
	}

	// Otherwise expand them into a (rows of U) x (rows of V) diagonal matrix
	GMatrix sigma(pU->rows(), pV->rows());
	sigma.setAll(0.0);
	size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
	for(size_t d = 0; d < diagLen; d++)
		sigma.row(d)[d] = pDiag[d];
	sigma.saveArff(sigmaOut.c_str());
}
Ejemplo n.º 15
0
// Refines the weights (m_pBeta) and biases (m_pEpsilon) with stochastic
// gradient descent. Each epoch visits every row once in a freshly shuffled
// order, and learningRate is multiplied by learningRateDecayFactor after
// each epoch.
void GLinearRegressor::refine(GMatrix& features, GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor)
{
	size_t featDims = features.cols();
	size_t labelDims = labels.cols();

	// Visiting order of the rows, reshuffled at the start of every epoch
	size_t* pOrder = new size_t[features.rows()];
	ArrayHolder<size_t> hOrder(pOrder);
	GIndexVec::makeIndexVec(pOrder, features.rows());

	for(size_t epoch = 0; epoch < epochs; epoch++)
	{
		GIndexVec::shuffle(pOrder, features.rows(), &m_rand);
		for(size_t n = 0; n < features.rows(); n++)
		{
			double* pFeat = features[pOrder[n]];
			double* pLab = labels[pOrder[n]];
			double* pBias = m_pEpsilon;
			for(size_t k = 0; k < labelDims; k++)
			{
				// Error of the current prediction for label dimension k
				double* pW = m_pBeta->row(k);
				double err = pLab[k] - (GVec::dotProduct(pFeat, pW, featDims) + pBias[k]);

				// Squared magnitude of the gradient (weights plus bias)
				double mag = 0.0;
				for(size_t l = 0; l < featDims; l++)
				{
					double d = pFeat[l] * err;
					mag += (d * d);
				}
				mag += err * err;

				// Shrink the step used for the weights when the gradient is large
				double lr = learningRate;
				if(mag > 1.0)
					lr /= mag;
				for(size_t l = 0; l < featDims; l++)
					pW[l] += pFeat[l] * lr * err;

				// The bias is updated with the unscaled learning rate
				pBias[k] += learningRate * err;
			}
		}
		learningRate *= learningRateDecayFactor;
	}
}
Ejemplo n.º 16
0
// Replaces a randomly chosen portion of the values in a dataset with the
// "unknown" marker and prints the result.
// Arguments: input filename, portion of values to drop, then optionally
// -seed <n>.
void dropRandomValues(GArgReader& args)
{
	// Load the data. The holder frees it on every exit path, including when an
	// option error is thrown below.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// n counts the cells not yet visited and k the cells still to be dropped.
	// A cell is dropped when a draw from next(n) is below k, so (assuming
	// next(n) is uniform over [0, n)) exactly the initial k cells get dropped.
	GRand prng(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		// A value count of 0 marks a continuous attribute, which uses a
		// different "unknown" marker than a discrete one
		const bool continuous = (pData->relation()->valueCount(i) == 0);
		const double unknown = continuous ? (double)UNKNOWN_REAL_VALUE : (double)UNKNOWN_DISCRETE_VALUE;
		for(size_t j = 0; j < pData->rows(); j++)
		{
			if(prng.next(n) < k)
			{
				pData->row(j)[i] = unknown;
				k--;
			}
			n--;
		}
	}
	pData->print(cout);
}
Ejemplo n.º 17
0
// Operates on one column of a dataset and prints the result.
// If the column already has a nonzero value count, its value count is simply
// set to 0. Otherwise each distinct value in the column is replaced by an
// integer code, assigned in order of first appearance (0, 1, 2, ...).
// Arguments: input filename, column index.
void enumerateValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();

	if(pData->relation()->valueCount(col) > 0)
	{
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
	}
	else
	{
		// Maps each value seen so far to the code it was given
		map<double,size_t> codes;
		size_t nextCode = 0;
		for(size_t r = 0; r < pData->rows(); r++)
		{
			double* pRow = pData->row(r);
			map<double,size_t>::iterator found = codes.find(pRow[col]);
			size_t code;
			if(found != codes.end())
			{
				code = found->second;
			}
			else
			{
				code = nextCode;
				codes[pRow[col]] = code;
				nextCode++;
			}
			pRow[col] = (double)code;
		}
	}
	pData->print(cout);
}
Ejemplo n.º 18
0
// Collects column c from every .arff file returned by GFile::fileList into a
// single matrix (one result column per file, in listing order) and prints it.
// Argument: the column index c.
// Throws if no .arff file is found.
void aggregateCols(GArgReader& args)
{
	size_t c = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);

	// Pick out the .arff files first, so the result matrix can be sized by the
	// number of files that actually contribute a column
	vector<string> arffFiles;
	for(vector<string>::iterator it = files.begin(); it != files.end(); ++it)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") == 0)
			arffFiles.push_back(*it);
	}
	if(arffFiles.empty())
		ThrowError("No .arff files were found to aggregate");

	// Copy the requested column out of each file. The result matrix is created
	// when the first file is loaded, because its row count comes from that file.
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	for(size_t i = 0; i < arffFiles.size(); i++)
	{
		GMatrix* pData = loadData(arffFiles[i].c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			pResults = new GMatrix(pData->rows(), arffFiles.size());
			hResults.reset(pResults);
		}
		pResults->copyColumns(i, pData, c, 1);
	}
	pResults->print(cout);
}
Ejemplo n.º 19
0
// Adds noise drawn from prng.normal(), scaled by the given deviation, to every
// value in a dataset except those in the last 'excludelast' columns, then
// prints the result.
// Arguments: input filename, deviation, then any of -seed <n> and
// -excludelast <n>.
// Throws if an option is unknown or -excludelast exceeds the column count.
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Without this check the unsigned subtraction below would wrap around and
	// the loop would write far past the end of every row
	if(excludeLast > pData->cols())
		ThrowError("The value for -excludelast exceeds the number of columns in the data");

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			pRow[c] += dev * prng.normal();
	}
	pData->print(cout);
}
Ejemplo n.º 20
0
// Runs "waffles_transform mergevert a.arff b.arff" on two small files whose
// nominal attributes list their values differently, and checks the merged
// output. Throws Ex on any mismatch.
void test_transform_mergevert()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { alice, bob }\n"
		"@ATTRIBUTE a3 { true, false }\n"
		"@DATA\n"
		"1.2, alice, true\n"
		"2.3, bob, false\n"
		);
	TempFileMaker tempFile2("b.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { charlie, bob }\n"
		"@ATTRIBUTE a3 { false, true }\n"
		"@DATA\n"
		"3.4, bob, true\n"
		"4.5, charlie, false\n"
		);

	// Execute the command and capture what it prints
	GPipe pipeStdOut;
	if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	const size_t capacity = sizeof(buf);
	size_t len = pipeStdOut.read(buf, capacity);
	if(len == capacity)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Parse the output and check its shape
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 4 || M.cols() != 3)
		throw Ex("failed");

	// Expected value count of each attribute (0 for the continuous one)
	static const size_t expectedValueCounts[3] = { 0, 3, 2 };
	for(size_t c = 0; c < 3; c++)
	{
		if(M.relation().valueCount(c) != expectedValueCounts[c])
			throw Ex("failed");
	}

	// Value 2 of attribute 1 should print as "charlie"
	std::ostringstream oss;
	const GArffRelation* pRel = (const GArffRelation*)&M.relation();
	pRel->printAttrValue(oss, 1, 2.0);
	string s = oss.str();
	if(strcmp(s.c_str(), "charlie") != 0)
		throw Ex("failed");

	// Expected contents, checked one column at a time
	static const double expected[4][3] =
	{
		{ 1.2, 0, 0 },
		{ 2.3, 1, 1 },
		{ 3.4, 1, 0 },
		{ 4.5, 2, 1 },
	};
	for(size_t c = 0; c < 3; c++)
	{
		for(size_t r = 0; r < 4; r++)
		{
			if(M[r][c] != expected[r][c])
				throw Ex("failed");
		}
	}
}
Ejemplo n.º 21
0
// Converts the continuous attributes in a column range into nominal ones by
// dividing each attribute's value range into equal-width buckets, then prints
// the result. Attributes that are already nominal are left alone.
// Arguments: input filename, then any of -buckets <n> and
// -colrange <first> <last> (inclusive).
// Defaults: all columns; max(2, floor(sqrt(rows + 0.5))) buckets.
// Throws if an option is unknown, the bucket count is 0, or the column range
// is invalid.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(nBuckets < 1)
		ThrowError("The number of buckets must be at least 1");

	// nFirst is unsigned, so only the upper bound and the ordering need checking.
	// With zero columns nLast has wrapped around and fails the first test.
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	const double lastBucket = (double)(nBuckets - 1);
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);

			// Clamp while the value is still a double. Converting a negative
			// or NaN value to size_t first would be undefined behaviour. NaN
			// arises when range is 0 (0/0), and the negated comparison sends
			// it to bucket 0.
			double bucket = floor(((pPat[i] - min) * nBuckets) / range);
			if(!(bucket >= 0.0))
				bucket = 0.0;
			else if(bucket > lastBucket)
				bucket = lastBucket;
			pPat[i] = bucket;
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
Ejemplo n.º 22
0
// virtual
// Trains by resetting the model for the given relations and then presenting
// every row once through trainIncremental. Throws Ex unless all features and
// all labels are nominal.
void GNaiveBayes::trainInner(const GMatrix& features, const GMatrix& labels)
{
	// Only nominal data is supported
	if(!features.relation().areNominal())
	{
		throw Ex("GNaiveBayes only supports nominal features. Perhaps you should wrap it in a GAutoFilter.");
	}
	if(!labels.relation().areNominal())
	{
		throw Ex("GNaiveBayes only supports nominal labels. Perhaps you should wrap it in a GAutoFilter.");
	}

	// Start over, then feed the rows in order
	beginIncrementalLearningInner(features.relation(), labels.relation());
	size_t row = 0;
	while(row < features.rows())
	{
		trainIncremental(features[row], labels[row]);
		row++;
	}
}
Ejemplo n.º 23
0
// virtual
// Trains by resetting the model for the given relations and then presenting
// every row once through trainIncremental. Throws Ex unless all features and
// all labels are continuous.
void GNaiveInstance::trainInner(const GMatrix& features, const GMatrix& labels)
{
	// Only continuous data is supported
	if(!features.relation().areContinuous())
	{
		throw Ex("GNaiveInstance only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	}
	if(!labels.relation().areContinuous())
	{
		throw Ex("GNaiveInstance only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");
	}

	// Start over, then feed the rows in order
	beginIncrementalLearningInner(features.relation(), labels.relation());
	size_t row = 0;
	while(row < features.rows())
	{
		trainIncremental(features[row], labels[row]);
		row++;
	}
}
Ejemplo n.º 24
0
// Runs "waffles_recommend fillmissingvalues a.arff baseline" on a small file
// with many missing values and spot-checks selected cells of the output.
// Throws Ex on any mismatch.
void test_recommend_fillmissingvalues()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 { a, b, c }\n"
		"@ATTRIBUTE a2 continuous\n"
		"@ATTRIBUTE a3 { d, e, f }\n"
		"@ATTRIBUTE a4 { g, h, i }\n"
		"@DATA\n"
		"a, ?, f, i\n"
		"?, 2, ?, i\n"
		"b, ?, d, ?\n"
		"?, 4, ?, ?\n"
		"?, ?, e, g\n"
		"?, ?, e, ?\n"
		"a, ?, ?, h\n"
		"\n"
		);

	// Execute the command and capture what it prints
	GPipe pipeStdOut;
	if(sysExec("waffles_recommend", "fillmissingvalues a.arff baseline", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	const size_t capacity = sizeof(buf);
	size_t len = pipeStdOut.read(buf, capacity);
	if(len == capacity)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Parse the output and check its shape
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 7 || M.cols() != 4)
		throw Ex("failed");

	// Cells to verify: row, column, expected value
	struct Expectation
	{
		size_t row;
		size_t col;
		double value;
	};
	static const Expectation checks[] =
	{
		{ 0, 0, 0 },
		{ 0, 1, 3 },
		{ 1, 1, 2 },
		{ 2, 1, 3 },
		{ 3, 3, 2 },
		{ 4, 0, 0 },
		{ 5, 1, 3 },
		{ 6, 2, 1 },
		{ 6, 3, 1 },
	};
	for(size_t i = 0; i < sizeof(checks) / sizeof(checks[0]); i++)
	{
		if(M[checks[i].row][checks[i].col] != checks[i].value)
			throw Ex("failed");
	}
}
Ejemplo n.º 25
0
	// Fills m_transformed from m_pData2: every row is multiplied by the
	// transform decoded from pVector and then offset by pVector's first
	// m_attrs values. The transform's entries are read from pVector starting
	// at offset m_attrs.
	void TransformData(const double* pVector)
	{
		const double* pMatrixPart = pVector + m_attrs;
		m_transform.fromVector(pMatrixPart, m_attrs);
		size_t r = 0;
		while(r < m_pData2->rows())
		{
			double* pSrc = m_pData2->row(r);
			double* pDst = m_transformed.row(r);
			m_transform.multiply(pSrc, pDst);
			GVec::add(pDst, pVector, m_attrs);
			r++;
		}
	}
Ejemplo n.º 26
0
// virtual
// Chooses the model weights by sampling. For each of m_samples random weight
// vectors, the ensemble is scored on the given data and the vector is folded
// into a running combination, weighted by exp(score - best score so far).
// The combined vector becomes the final model weights.
void GBayesianModelCombination::determineWeights(GMatrix& features, GMatrix& labels)
{
	const size_t modelCount = m_models.size();
	double* pWeights = new double[modelCount];
	ArrayHolder<double> hWeights(pWeights);
	GVec::setAll(pWeights, 0.0, modelCount);
	double sumWeight = 0.0;
	double maxLogProb = -1e38;
	for(size_t sample = 0; sample < m_samples; sample++)
	{
		// Draw a random weight for each model from an exponential distribution,
		// then normalize (a draw from a Dirichlet distribution with uniform
		// probabilities)
		for(size_t m = 0; m < modelCount; m++)
			m_models[m]->m_weight = m_rand.exponential();
		normalizeWeights();

		// Score the ensemble: d is 1 minus the mean squared error per label row
		double d = 1.0 - (sumSquaredError(features, labels) / labels.rows());
		double logProbEnsembleGivenData;
		if(d <= 0.0)
		{
			logProbEnsembleGivenData = -1e38;
		}
		else if(d == 1.0)
		{
			logProbEnsembleGivenData = 0.0;
		}
		else
		{
			logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d));
		}

		// When a new best score appears, rescale what has been accumulated so
		// far so that it is relative to the new maximum
		if(logProbEnsembleGivenData > maxLogProb)
		{
			GVec::multiply(pWeights, exp(maxLogProb - logProbEnsembleGivenData), modelCount);
			maxLogProb = logProbEnsembleGivenData;
		}

		// Fold this sample into the running combination
		double w = exp(logProbEnsembleGivenData - maxLogProb);
		GVec::multiply(pWeights, sumWeight / (sumWeight + w), modelCount);
		for(size_t m = 0; m < modelCount; m++)
			pWeights[m] += w * m_models[m]->m_weight;
		sumWeight += w;
	}

	// Install the combined weights
	for(size_t m = 0; m < modelCount; m++)
		m_models[m]->m_weight = pWeights[m];
}
Ejemplo n.º 27
0
// virtual
// Trains the model from the given data. Accumulates A = w * sum(x x^T) and
// XY = w * sum(x y^T) over all rows, where w = 1 / m_noiseDev^2, then stores
// the pseudo-inverse of A in m_pAInv and the weights derived from XY and that
// inverse in m_pWBar.
void GLinearDistribution::trainInner(GMatrix& features, GMatrix& labels)
{
	// A starts out as all zeros
	size_t dims = features.cols();
	GMatrix a(dims, dims);
	a.setAll(0.0);

	// So does XY
	size_t labelDims = labels.cols();
	GMatrix xy(dims, labelDims);
	xy.setAll(0.0);

	// Accumulate the outer products of every instance
	for(size_t i = 0; i < features.rows(); i++)
	{
		double* pFeat = features[i];
		double* pLab = labels[i];
		for(size_t j = 0; j < dims; j++)
		{
			// Row j of A gains pFeat[j] times the feature vector
			double* pARow = a[j];
			for(size_t k = 0; k < dims; k++)
				pARow[k] += pFeat[j] * pFeat[k];

			// Row j of XY gains pFeat[j] times the label vector
			double* pXYRow = xy[j];
			for(size_t k = 0; k < labelDims; k++)
				pXYRow[k] += pFeat[j] * pLab[k];
		}
	}

	// Scale both by the noise precision
	double w = 1.0 / (m_noiseDev * m_noiseDev);
	a.multiply(w);
	xy.multiply(w);

	// Compute final matrices
	clear();
	m_pAInv = a.pseudoInverse();
	GAssert(m_pAInv->cols() == dims);
	GAssert(m_pAInv->rows() == dims);
	m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true);
	GAssert(m_pWBar->cols() == dims);
	GAssert(m_pWBar->rows() == labelDims);
	m_pBuf = new double[dims];
}
Ejemplo n.º 28
0
// virtual
// Trains one GPolynomialSingleLabel per label column. Any previously trained
// polynomials are cleared first.
void GPolynomial::trainInner(GMatrix& features, GMatrix& labels)
{
	// Scratch matrix that holds one label column at a time
	GMatrix labelCol(labels.rows(), 1);
	clear();

	size_t col = 0;
	while(col < labels.cols())
	{
		// The new polynomial is added to m_polys before it is trained
		GPolynomialSingleLabel* pPoly = new GPolynomialSingleLabel(m_controlPoints);
		m_polys.push_back(pPoly);
		labelCol.copyColumns(0, &labels, col, 1);
		pPoly->train(features, labelCol);
		col++;
	}
}
Ejemplo n.º 29
0
// Loads a dataset named by the next argument and stores a copy of it in hOutput.
// The format is taken from an optional "-input_type <type>" that follows the
// filename; otherwise it is deduced from the filename's extension, defaulting
// to ARFF when there is none. Supported types (case-insensitive): arff, csv,
// dat. For csv and dat a parsing report is written to stderr.
// Throws Ex if there are no arguments left or the format is not supported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);

	// Determine the input type
	const char* input_type;
	if(args.next_is_flag() && args.if_pop("-input_type"))
	{
		input_type = args.pop_string();
	}
	else
	{
		// Deduce it from the extension (if any)
		input_type = szFilename + pd.extStart;
		if(*input_type != '.')
			input_type = "arff"; // no extension - assume ARFF
		else
			input_type++; // skip the dot
	}

	// Load the data
	GMatrix data;
	const bool isCsv = (_stricmp(input_type, "csv") == 0);
	const bool isDat = (_stricmp(input_type, "dat") == 0);
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(isCsv || isDat)
	{
		// The two formats are parsed the same way, except that "dat" sets
		// the separator to '\0'
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		// Report the type that was actually tested, which is not the file's
		// extension when -input_type was given
		throw Ex("Unsupported file format: ", input_type);
	}

	// Hand a heap-allocated copy of the whole matrix to the caller
	hOutput.reset(data.cloneSub(0, 0, data.rows(), data.cols()));
}
Ejemplo n.º 30
0
// virtual
void GBayesianModelCombination::determineWeights(const GMatrix& features, const GMatrix& labels)
{
	GQUICKVEC(weights, m_models.size());
	weights.fill(0.0);
	double sumWeight = 0.0;
	double maxLogProb = -1e38;
	for(size_t i = 0; i < m_samples; i++)
	{
		// Set weights randomly from a dirichlet distribution with unifrom probabilities
		for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
			(*it)->m_weight = m_rand.exponential();
		normalizeWeights();

		// Evaluate accuracy
		double d = 1.0 - (sumSquaredError(features, labels) / labels.rows());
		double logProbEnsembleGivenData;
		if(d <= 0.0)
			logProbEnsembleGivenData = -1e38;
		else if(d == 1.0)
			logProbEnsembleGivenData = 0.0;
		else
			logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d));

		// Update the weights
		if(logProbEnsembleGivenData > maxLogProb)
		{
			weights *= exp(maxLogProb - logProbEnsembleGivenData);
			maxLogProb = logProbEnsembleGivenData;
		}
		double w = exp(logProbEnsembleGivenData - maxLogProb);
		weights *= (sumWeight / (sumWeight + w));
		size_t pos = 0;
		for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
			weights[pos++] += (w * (*it)->m_weight);
		sumWeight += w;
	}
	size_t pos = 0;
	for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
		(*it)->m_weight = weights[pos++];
}