Example #1
File: GLinear.cpp  Project: b2020b/waffles
// virtual
void GLinearRegressor::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon
	clear();
	GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels);
	Holder<GMatrix> hAll(pAll);
	GPCA pca(features.cols());
	pca.train(*pAll);
	size_t inputs = features.cols();
	size_t outputs = labels.cols();
	GMatrix f(inputs, inputs);
	GMatrix l(inputs, outputs);
	for(size_t i = 0; i < inputs; i++)
	{
		GVec::copy(f[i].data(), pca.basis()->row(i).data(), inputs);
		double sqmag = f[i].squaredMagnitude();
		if(sqmag > 1e-10)
			f[i] *= 1.0 / sqmag;
		l[i].set(pca.basis()->row(i).data() + inputs, outputs);
	}
	m_pBeta = GMatrix::multiply(l, f, true, false);
	m_epsilon.resize(outputs);
	GVecWrapper vw(pca.centroid().data(), m_pBeta->cols());
	m_pBeta->multiply(vw.vec(), m_epsilon, false);
	m_epsilon *= -1.0;
	GVec::add(m_epsilon.data(), pca.centroid().data() + inputs, outputs);

	// Refine the results using gradient descent
	refine(features, labels, 0.06, 20, 0.75);
}
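For context, here is a minimal usage sketch for this regressor. The train/predict calls follow the patterns visible in examples #2 and #30; the header paths, the GClasses namespace, and the no-argument constructor are assumptions that may differ across waffles versions.

// Hypothetical usage sketch -- includes and constructor are assumptions
#include <GClasses/GMatrix.h>
#include <GClasses/GLinear.h>
#include <iostream>

using namespace GClasses;

int main()
{
	// Load a dataset, then treat the last column as the label
	GMatrix data;
	data.loadArff("train.arff");
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols() - 1);
	GMatrix* pLabels = data.cloneSub(0, data.cols() - 1, data.rows(), 1);

	// Train, then predict the label of the first row
	GLinearRegressor model;
	model.train(*pFeatures, *pLabels);
	GVec prediction(1);
	model.predict((*pFeatures)[0], prediction);
	std::cout << prediction[0] << "\n";

	delete(pFeatures);
	delete(pLabels);
	return 0;
}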
Example #2
void loadData(GMatrix& m, const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		m.loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
	{
		GCSVParser parser;
		parser.parse(m, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < m.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(m, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < m.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
}
Example #3
// virtual
void GLinearRegressor::trainInner(GMatrix& features, GMatrix& labels)
{
	// Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon
	clear();
	GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels);
	Holder<GMatrix> hAll(pAll);
	GPCA pca(features.cols(), &m_rand);
	pca.train(*pAll);
	size_t inputs = features.cols();
	size_t outputs = labels.cols();
	GMatrix f(inputs, inputs);
	GMatrix l(inputs, outputs);
	for(size_t i = 0; i < inputs; i++)
	{
		GVec::copy(f[i], pca.basis(i), inputs);
		double sqmag = GVec::squaredMagnitude(f[i], inputs);
		if(sqmag > 1e-10)
			GVec::multiply(f[i], 1.0 / sqmag, inputs);
		GVec::copy(l[i], pca.basis(i) + inputs, outputs);
	}
	m_pBeta = GMatrix::multiply(l, f, true, false);
	m_pEpsilon = new double[outputs];
	m_pBeta->multiply(pca.mean(), m_pEpsilon, false);
	GVec::multiply(m_pEpsilon, -1.0, outputs);
	GVec::add(m_pEpsilon, pca.mean() + inputs, outputs);

	// Refine the results using gradient descent
	refine(features, labels, 0.06, 20, 0.75);
}
Example #4
/***********************************************************************//**
 * @brief GMatrix to GSymMatrix storage class converter
 *
 * @param[in] matrix General matrix (GMatrix).
 *
 * @exception GException::matrix_not_symmetric
 *            Matrix is not symmetric.
 *
 * Converts a general matrix into the symmetric storage class. If the input
 * matrix is not symmetric, an exception is thrown.
 ***************************************************************************/
GSymMatrix::GSymMatrix(const GMatrix& matrix)
{
    // Initialise class members for clean destruction
    init_members();

    // Allocate matrix memory
    alloc_members(matrix.rows(), matrix.cols());

    // Fill matrix
    for (int col = 0; col < matrix.cols(); ++col) {
        for (int row = col; row < matrix.rows(); ++row) {
            double value_ll = matrix(row,col);
            double value_ur = matrix(col,row);
            if (value_ll != value_ur) {
                throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                                       matrix.rows(),
                                                       matrix.cols());
            }
            (*this)(row, col) = matrix(row, col);
        }
    }

    // Return
    return;
}
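The element-by-element check above generalizes to a standalone symmetry test. Below is a minimal sketch over a plain row-major std::vector (a hypothetical helper, not part of the library). Like the constructor, it uses exact equality; real data may warrant a tolerance.

// Hypothetical standalone symmetry test for an n-by-n row-major matrix
#include <cstddef>
#include <vector>

bool isSymmetric(const std::vector<double>& m, std::size_t n)
{
	// Compare each lower-triangle element with its upper-triangle mirror
	for (std::size_t col = 0; col < n; ++col) {
		for (std::size_t row = col + 1; row < n; ++row) {
			if (m[row * n + col] != m[col * n + row])
				return false;
		}
	}
	return true;
}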
Example #5
void GPolynomialSingleLabel::train(GMatrix& features, GMatrix& labels)
{
	GAssert(labels.cols() == 1);
	init(features.cols());
	GPolynomialRegressCritic critic(this, features, labels);
	//GStochasticGreedySearch search(&critic);
	GMomentumGreedySearch search(&critic);
	search.searchUntil(100, 30, .01);
	setCoefficients(search.currentVector());
	fromBezierCoefficients();
}
Example #6
File: main.cpp  Project: BaskWind/waffles
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Load the dataset by extension
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix data;
	vector<size_t> abortedCols;
	vector<size_t> ambiguousCols;
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}
	
	// Now load the data
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(_stricmp(input_type, "csv") == 0)
	{
		GCSVParser parser;
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(input_type, "dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	}
	
	// Copy the full dataset into the output holder (no feature/label split is performed here)
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
Example #7
// virtual
void GLinearDistribution::trainInner(GMatrix& features, GMatrix& labels)
{
	// Init A with the inverse of the weights prior covariance matrix
	size_t dims = features.cols();
	GMatrix a(dims, dims);
	a.setAll(0.0);

	// Init XY
	size_t labelDims = labels.cols();
	GMatrix xy(dims, labelDims);
	xy.setAll(0.0);

	// Train on each instance
	double w = 1.0 / (m_noiseDev * m_noiseDev);
	for(size_t i = 0; i < features.rows(); i++)
	{
		// Update A
		double* pFeat = features[i];
		for(size_t j = 0; j < dims; j++)
		{
			double* pEl = a[j];
			for(size_t k = 0; k < dims; k++)
			{
				*pEl += pFeat[j] * pFeat[k];
				pEl++;
			}
		}

		// Update XY
		double* pLab = labels[i];
		for(size_t j = 0; j < dims; j++)
		{
			double* pEl = xy[j];
			for(size_t k = 0; k < labelDims; k++)
			{
				*pEl += pFeat[j] * pLab[k];
				pEl++;
			}
		}
	}
	a.multiply(w);
	xy.multiply(w);

	// Compute final matrices
	clear();
	m_pAInv = a.pseudoInverse();
	GAssert(m_pAInv->cols() == dims);
	GAssert(m_pAInv->rows() == dims);
	m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true);
	GAssert(m_pWBar->cols() == dims);
	GAssert(m_pWBar->rows() == labelDims);
	m_pBuf = new double[dims];
}
Example #8
File: GLinear.cpp  Project: b2020b/waffles
// virtual
void GLinearDistribution::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Init A with the inverse of the weights prior covariance matrix
	size_t dims = features.cols();
	GMatrix a(dims, dims);
	a.setAll(0.0);

	// Init XY
	size_t labelDims = labels.cols();
	GMatrix xy(dims, labelDims);
	xy.setAll(0.0);

	// Train on each instance
	double w = 1.0 / (m_noiseDev * m_noiseDev);
	for(size_t i = 0; i < features.rows(); i++)
	{
		// Update A
		const GVec& feat = features[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = a[j];
			for(size_t k = 0; k < dims; k++)
				el[k] += feat[j] * feat[k];
		}

		// Update XY
		const GVec& lab = labels[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = xy[j];
			for(size_t k = 0; k < labelDims; k++)
				el[k] += feat[j] * lab[k];
		}
	}
	a.multiply(w);
	xy.multiply(w);

	// Compute final matrices
	clear();
	m_pAInv = a.pseudoInverse();
	GAssert(m_pAInv->cols() == dims);
	GAssert(m_pAInv->rows() == dims);
	m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true);
	GAssert(m_pWBar->cols() == dims);
	GAssert(m_pWBar->rows() == labelDims);
	m_buf.resize(dims);
}
Example #9
File: main.cpp  Project: litaoshao/waffles
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
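The rand.next(n) < k test above is selection sampling (Knuth's Algorithm S): each cell is kept with probability k/n, where k and n count the selections and cells still remaining, so exactly k of the original n cells are marked, uniformly at random, in one pass. A minimal standalone sketch with the standard <random> facilities (hypothetical helper):

// Hypothetical sketch of selection sampling: pick exactly k of n indexes
#include <cstddef>
#include <random>
#include <vector>

std::vector<std::size_t> sampleK(std::size_t n, std::size_t k, std::mt19937& rng)
{
	std::vector<std::size_t> picks;
	for (std::size_t i = 0; i < n; ++i) {
		// n - i items remain; select item i with probability k / (n - i)
		std::uniform_int_distribution<std::size_t> dist(0, n - i - 1);
		if (dist(rng) < k) {
			picks.push_back(i);
			--k;
		}
	}
	return picks; // always exactly k indexes (for k <= n)
}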
Example #10
void GLinearRegressor::refine(GMatrix& features, GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor)
{
	size_t fDims = features.cols();
	size_t lDims = labels.cols();
	size_t* pIndexes = new size_t[features.rows()];
	ArrayHolder<size_t> hIndexes(pIndexes);
	GIndexVec::makeIndexVec(pIndexes, features.rows());
	for(size_t i = 0; i < epochs; i++)
	{
		GIndexVec::shuffle(pIndexes, features.rows(), &m_rand);
		size_t* pIndex = pIndexes;
		for(size_t j = 0; j < features.rows(); j++)
		{
			double* pFeat = features[*pIndex];
			double* pLab = labels[*pIndex];
			double* pBias = m_pEpsilon;
			for(size_t k = 0; k < lDims; k++)
			{
				double err = *pLab - (GVec::dotProduct(pFeat, m_pBeta->row(k), fDims) + *pBias);
				double* pF = pFeat;
				double lr = learningRate;
				double mag = 0.0;
				for(size_t l = 0; l < fDims; l++)
				{
					double d = *pF * err;
					mag += (d * d);
					pF++;
				}
				mag += err * err;
				if(mag > 1.0)
					lr /= mag;
				pF = pFeat;
				double* pW = m_pBeta->row(k);
				for(size_t l = 0; l < fDims; l++)
				{
					*pW += *pF * lr * err;
					pF++;
					pW++;
				}
				*pBias += learningRate * err;
				pLab++;
				pBias++;
			}
			pIndex++;
		}
		learningRate *= learningRateDecayFactor;
	}
}
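Note the step-size rule in the inner loop: when the squared magnitude of the per-sample gradient exceeds 1, the learning rate is divided by it, bounding the size of any single update. A minimal standalone sketch of that rule for one linear unit (hypothetical helper; the bias deliberately uses the uncapped rate, mirroring the code above):

// Hypothetical sketch of the capped SGD step used by refine()
#include <cstddef>

void sgdStep(double* w, double& bias, const double* x, double y,
             std::size_t dims, double learningRate)
{
	// Prediction error for this sample
	double pred = bias;
	for (std::size_t i = 0; i < dims; ++i)
		pred += w[i] * x[i];
	double err = y - pred;

	// Squared magnitude of the gradient (weight terms plus bias term)
	double mag = err * err;
	for (std::size_t i = 0; i < dims; ++i)
		mag += (x[i] * err) * (x[i] * err);

	// Cap the step size, then apply the update
	double lr = learningRate;
	if (mag > 1.0)
		lr /= mag;
	for (std::size_t i = 0; i < dims; ++i)
		w[i] += lr * x[i] * err;
	bias += learningRate * err; // uncapped, as in the original
}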
Example #11
File: main.cpp  Project: litaoshao/waffles
void autoCorrelation(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t lag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);
	GMatrix ac(0, dims + 1);
	for(size_t i = 1; i <= lag; i++)
	{
		double* pRow = ac.newRow();
		*(pRow++) = (double)i;
		for(size_t j = 0; j < dims; j++)
		{
			*pRow = 0;
			size_t k;
			for(k = 0; k + i < pData->rows(); k++)
			{
				double* pA = pData->row(k);
				double* pB = pData->row(k + i);
				*pRow += (pA[j] - mean[j]) * (pB[j] - mean[j]);
			}
			*pRow /= k;
			pRow++;
		}
	}
	ac.print(cout);
}
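For each column, the inner loop computes the mean-removed lag-i autocovariance, averaged over the overlapping pairs. The same computation for a single series, as a minimal standalone sketch (hypothetical helper):

// Hypothetical sketch of the per-column lag autocovariance computed above
#include <cstddef>
#include <vector>

double autocovariance(const std::vector<double>& x, double mean, std::size_t lag)
{
	std::size_t count = 0;
	double sum = 0.0;
	for (std::size_t k = 0; k + lag < x.size(); ++k, ++count)
		sum += (x[k] - mean) * (x[k + lag] - mean);
	return count > 0 ? sum / count : 0.0;
}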
Example #12
File: main.cpp  Project: litaoshao/waffles
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid neighbor finder option: ", args.peek());
	}

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
Example #13
File: main.cpp  Project: kslazarev/waffles
void curviness2(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GNormalize norm;
	GMatrix* pDataNormalized = norm.doit(*pData);
	Holder<GMatrix> hDataNormalized(pDataNormalized);
	hData.reset();
	pData = NULL;

	// Parse Options
	size_t maxEigs = 10;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	Holder<GMatrix> hControlData(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-maxeigs"))
			maxEigs = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t targetDims = std::min(maxEigs, pDataNormalized->cols());

	// Do linear PCA
	GNeuroPCA np1(targetDims, &rand);
	np1.setActivation(new GActivationIdentity());
	np1.computeEigVals();
	GMatrix* pResults1 = np1.doit(*pDataNormalized);
	Holder<GMatrix> hResults1(pResults1);
	double* pEigVals1 = np1.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals1[i] = sqrt(pEigVals1[i]) - sqrt(pEigVals1[i + 1]);
	size_t max1 = GVec::indexOfMax(pEigVals1, targetDims - 1, &rand);
	double v1 = (double)max1;
	if(max1 > 0 && max1 + 2 < targetDims)
		v1 += (pEigVals1[max1 - 1] - pEigVals1[max1 + 1]) / (2.0 * (pEigVals1[max1 - 1] + pEigVals1[max1 + 1] - 2.0 * pEigVals1[max1]));

	// Do non-linear PCA
	GNeuroPCA np2(targetDims, &rand);
	np2.setActivation(new GActivationLogistic());
	np2.computeEigVals();
	GMatrix* pResults2 = np2.doit(*pDataNormalized);
	Holder<GMatrix> hResults2(pResults2);
	double* pEigVals2 = np2.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals2[i] = sqrt(pEigVals2[i]) - sqrt(pEigVals2[i + 1]);
	size_t max2 = GVec::indexOfMax(pEigVals2, targetDims - 1, &rand);
	double v2 = (double)max2;
	if(max2 > 0 && max2 + 2 < targetDims)
		v2 += (pEigVals2[max2 - 1] - pEigVals2[max2 + 1]) / (2.0 * (pEigVals2[max2 - 1] + pEigVals2[max2 + 1] - 2.0 * pEigVals2[max2]));

	// Compute the difference in where the eigenvalues fall
	cout.precision(14);
	cout << (v1 - v2) << "\n";
}
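The v1 += and v2 += corrections are standard three-point parabolic interpolation: fit a parabola through the discrete maximum and its two neighbors and take the fractional offset of the vertex. A minimal standalone sketch (hypothetical helper):

// Hypothetical sketch of three-point parabolic peak interpolation
#include <cstddef>

// Returns the fractional offset of the parabola's vertex from index m.
// Assumes m > 0 and y[m] is a local maximum with valid neighbors.
double parabolicPeakOffset(const double* y, std::size_t m)
{
	double denom = 2.0 * (y[m - 1] + y[m + 1] - 2.0 * y[m]);
	if (denom == 0.0)
		return 0.0; // flat neighborhood; no refinement possible
	return (y[m - 1] - y[m + 1]) / denom;
}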
Example #14
File: main.cpp  Project: kslazarev/waffles
void blendEmbeddings(GArgReader& args)
{
	// Load the files and params
	GMatrix* pDataOrig = loadData(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	GRand prng(seed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pDataOrig, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	GMatrix* pDataA = loadData(args.pop_string());
	Holder<GMatrix> hDataA(pDataA);
	GMatrix* pDataB = loadData(args.pop_string());
	Holder<GMatrix> hDataB(pDataB);
	if(pDataA->rows() != pDataOrig->rows() || pDataB->rows() != pDataOrig->rows())
		throw Ex("mismatching number of rows");
	if(pDataA->cols() != pDataB->cols())
		throw Ex("mismatching number of cols");

	// Parse Options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Get a neighbor table
	if(!pNF->isCached())
	{
		GNeighborFinderCacheWrapper* pNF2 = new GNeighborFinderCacheWrapper(hNF.release(), true);
		hNF.reset(pNF2);
		pNF = pNF2;
	}
	((GNeighborFinderCacheWrapper*)pNF)->fillCache();
	size_t* pNeighborTable = ((GNeighborFinderCacheWrapper*)pNF)->cache();

	// Do the blending
	size_t startPoint = (size_t)prng.next(pDataA->rows());
	double* pRatios = new double[pDataA->rows()];
	ArrayHolder<double> hRatios(pRatios);
	GVec::setAll(pRatios, 0.5, pDataA->rows());
	GMatrix* pDataC = GManifold::blendEmbeddings(pDataA, pRatios, pDataB, pNF->neighborCount(), pNeighborTable, startPoint);
	Holder<GMatrix> hDataC(pDataC);
	pDataC->print(cout);
}
Example #15
void test_transform_mergevert()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { alice, bob }\n"
		"@ATTRIBUTE a3 { true, false }\n"
		"@DATA\n"
		"1.2, alice, true\n"
		"2.3, bob, false\n"
		);
	TempFileMaker tempFile2("b.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { charlie, bob }\n"
		"@ATTRIBUTE a3 { false, true }\n"
		"@DATA\n"
		"3.4, bob, true\n"
		"4.5, charlie, false\n"
		);

	// Execute the command
	GPipe pipeStdOut;
	if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	size_t len = pipeStdOut.read(buf, 512);
	if(len == 512)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Check the results
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 4 || M.cols() != 3)
		throw Ex("failed");
	if(M.relation().valueCount(0) != 0)
		throw Ex("failed");
	if(M.relation().valueCount(1) != 3)
		throw Ex("failed");
	if(M.relation().valueCount(2) != 2)
		throw Ex("failed");
	std::ostringstream oss;
	const GArffRelation* pRel = (const GArffRelation*)&M.relation();
	pRel->printAttrValue(oss, 1, 2.0);
	string s = oss.str();
	if(strcmp(s.c_str(), "charlie") != 0)
		throw Ex("failed");
	if(M[0][0] != 1.2 || M[1][0] != 2.3 || M[2][0] != 3.4 || M[3][0] != 4.5)
		throw Ex("failed");
	if(M[0][1] != 0 || M[1][1] != 1 || M[2][1] != 1 || M[3][1] != 2)
		throw Ex("failed");
	if(M[0][2] != 0 || M[1][2] != 1 || M[2][2] != 0 || M[3][2] != 1)
		throw Ex("failed");
}
Example #16
File: main.cpp  Project: litaoshao/waffles
void rotate(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	sp_relation relation = pA->relation();
	unsigned colx = args.pop_uint();
	if(colx >= pA->cols()){
		ThrowError("Rotation first column index (", to_str(colx), ") "
		           "should not be greater than the largest index, which is ",
		           to_str(pA->cols() - 1), ".");
	}
	if(!relation->areContinuous(colx, 1)){
		ThrowError("Rotation first column index (", to_str(colx), ") "
		           "should be continuous and it is not.");
	}
	unsigned coly = args.pop_uint();
	if(coly >= pA->cols()){
		ThrowError("Rotation second column index (", to_str(coly), ") "
		           "should not be greater than the largest index, which is ",
		           to_str(pA->cols() - 1), ".");
	}
	if(!relation->areContinuous(coly, 1)){
		ThrowError("Rotation second column index (", to_str(coly), ") "
		           "should be continuous and it is not.");
	}
	
	double angle = args.pop_double();

	angle = angle * M_PI / 180; // convert from degrees to radians
	double cosAngle = std::cos(angle);
	double sinAngle = std::sin(angle);
	for(std::size_t rowIdx = 0; rowIdx < pA->rows(); ++rowIdx){
		double* row = (*pA)[rowIdx];
		double x = row[colx];
		double y = row[coly];
		row[colx] = x * cosAngle - y * sinAngle;
		row[coly] = x * sinAngle + y * cosAngle;
	}
	pA->print(cout);
}
Example #17
// virtual
void GBayesianModelCombination::determineWeights(GMatrix& features, GMatrix& labels)
{
	double* pWeights = new double[m_models.size()];
	ArrayHolder<double> hWeights(pWeights);
	GVec::setAll(pWeights, 0.0, m_models.size());
	double sumWeight = 0.0;
	double maxLogProb = -1e38;
	GTEMPBUF(double, results, labels.cols());
	for(size_t i = 0; i < m_samples; i++)
	{
		// Set weights randomly from a Dirichlet distribution with uniform probabilities
		for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
			(*it)->m_weight = m_rand.exponential();
		normalizeWeights();

		// Evaluate accuracy
		accuracy(features, labels, results);
		double d = GVec::sumElements(results, labels.cols()) / labels.cols();
		double logProbEnsembleGivenData;
		if(d == 0.0)
			logProbEnsembleGivenData = -1e38;
		else if(d == 1.0)
			logProbEnsembleGivenData = 0.0;
		else
			logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d));

		// Update the weights
		if(logProbEnsembleGivenData > maxLogProb)
		{
			GVec::multiply(pWeights, exp(maxLogProb - logProbEnsembleGivenData), m_models.size());
			maxLogProb = logProbEnsembleGivenData;
		}
		double w = exp(logProbEnsembleGivenData - maxLogProb);
		GVec::multiply(pWeights, sumWeight / (sumWeight + w), m_models.size());
		double* pW = pWeights;
		for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
			*(pW++) += w * (*it)->m_weight;
		sumWeight += w;
	}
	double* pW = pWeights;
	for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
		(*it)->m_weight = *(pW++);
}
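The maxLogProb bookkeeping keeps every weight relative to the largest log-probability seen so far, so exp() is only applied to non-positive arguments and cannot overflow. A simplified standalone sketch of that rescaling trick (hypothetical helper; it accumulates a plain weighted sum, while the method above additionally renormalizes by the running weight total):

// Hypothetical sketch of log-domain weight accumulation without overflow
#include <cmath>
#include <cstddef>
#include <vector>

struct RunningLogWeights
{
	std::vector<double> weights; // accumulated relative to maxLogProb
	double maxLogProb = -1e38;

	void add(double logProb, const std::vector<double>& sample)
	{
		if (weights.empty())
			weights.assign(sample.size(), 0.0);
		if (logProb > maxLogProb)
		{
			// Rescale existing weights to the new reference point
			double scale = std::exp(maxLogProb - logProb);
			for (double& w : weights)
				w *= scale;
			maxLogProb = logProb;
		}
		double w = std::exp(logProb - maxLogProb);
		for (std::size_t i = 0; i < sample.size(); ++i)
			weights[i] += w * sample[i];
	}
};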
Example #18
void test_recommend_fillmissingvalues()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 { a, b, c }\n"
		"@ATTRIBUTE a2 continuous\n"
		"@ATTRIBUTE a3 { d, e, f }\n"
		"@ATTRIBUTE a4 { g, h, i }\n"
		"@DATA\n"
		"a, ?, f, i\n"
		"?, 2, ?, i\n"
		"b, ?, d, ?\n"
		"?, 4, ?, ?\n"
		"?, ?, e, g\n"
		"?, ?, e, ?\n"
		"a, ?, ?, h\n"
		"\n"
		);

	// Execute the command
	GPipe pipeStdOut;
	if(sysExec("waffles_recommend", "fillmissingvalues a.arff baseline", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	size_t len = pipeStdOut.read(buf, 512);
	if(len == 512)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Check the results
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 7 || M.cols() != 4)
		throw Ex("failed");
	if(M[0][0] != 0)
		throw Ex("failed");
	if(M[0][1] != 3)
		throw Ex("failed");
	if(M[1][1] != 2)
		throw Ex("failed");
	if(M[2][1] != 3)
		throw Ex("failed");
	if(M[3][3] != 2)
		throw Ex("failed");
	if(M[4][0] != 0)
		throw Ex("failed");
	if(M[5][1] != 3)
		throw Ex("failed");
	if(M[6][2] != 1)
		throw Ex("failed");
	if(M[6][3] != 1)
		throw Ex("failed");
}
Example #19
// virtual
void GPolynomial::trainInner(GMatrix& features, GMatrix& labels)
{
	GMatrix labelCol(labels.rows(), 1);
	clear();
	for(size_t i = 0; i < labels.cols(); i++)
	{
		GPolynomialSingleLabel* pPSL = new GPolynomialSingleLabel(m_controlPoints);
		m_polys.push_back(pPSL);
		labelCol.copyColumns(0, &labels, i, 1);
		pPSL->train(features, labelCol);
	}
}
Example #20
File: main.cpp  Project: litaoshao/waffles
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	pTransition->newRows(pActions->rows() - 1);
	for(size_t i = 0; i < pActions->rows() - 1; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
Example #21
File: main.cpp  Project: litaoshao/waffles
void dropColumns(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	vector<size_t> colList;
	size_t attrCount = pData->cols();
	parseAttributeList(colList, args, attrCount);
	std::sort(colList.begin(), colList.end());
	std::reverse(colList.begin(), colList.end());
	for(size_t i = 0; i < colList.size(); i++)
		pData->deleteColumn(colList[i]);
	pData->print(cout);
}
Example #22
File: GLinear.cpp  Project: b2020b/waffles
void GLinearRegressor::refine(const GMatrix& features, const GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor)
{
	size_t fDims = features.cols();
	size_t lDims = labels.cols();
	size_t* pIndexes = new size_t[features.rows()];
	ArrayHolder<size_t> hIndexes(pIndexes);
	GIndexVec::makeIndexVec(pIndexes, features.rows());
	for(size_t i = 0; i < epochs; i++)
	{
		GIndexVec::shuffle(pIndexes, features.rows(), &m_rand);
		size_t* pIndex = pIndexes;
		for(size_t j = 0; j < features.rows(); j++)
		{
			const GVec& feat = features[*pIndex];
			const GVec& lab = labels[*pIndex];
			for(size_t k = 0; k < lDims; k++)
			{
				double err = lab[k] - (feat.dotProduct(m_pBeta->row(k)) + m_epsilon[k]);
				double lr = learningRate;
				double mag = 0.0;
				for(size_t l = 0; l < fDims; l++)
				{
					double d = feat[l] * err;
					mag += (d * d);
				}
				mag += err * err;
				if(mag > 1.0)
					lr /= mag;
				GVec& w = m_pBeta->row(k);
				for(size_t l = 0; l < fDims; l++)
					w[l] += feat[l] * lr * err;
				m_epsilon[k] += learningRate * err;
			}
			pIndex++;
		}
		learningRate *= learningRateDecayFactor;
	}
}
Example #23
File: main.cpp  Project: litaoshao/waffles
///TODO: this command should be documented
void center(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int r = args.pop_uint();
	size_t cols = pData->cols();
	double* pRow = pData->row(r);
	for(size_t i = 0; i < r; ++i)
		GVec::subtract(pData->row(i), pRow, cols);
	for(size_t i = r + 1; i < pData->rows(); ++i)
		GVec::subtract(pData->row(i), pRow, cols);
	GVec::setAll(pRow, 0.0, cols);
	pData->print(cout);
}
Example #24
File: main.cpp  Project: litaoshao/waffles
void shiftColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double offset = args.pop_double();
	for(size_t i = 0; i < pA->rows(); i++)
	{
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += offset;
	}
	pA->print(cout);
}
Example #25
// virtual
void GBayesianModelAveraging::determineWeights(GMatrix& features, GMatrix& labels)
{
	GTEMPBUF(double, results, labels.cols());
	double m = -1e38;
	for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
	{
		(*it)->m_pModel->accuracy(features, labels, results);
		double d = GVec::sumElements(results, labels.cols()) / labels.cols();
		double logProbHypothGivenData;
		if(d == 0.0)
			logProbHypothGivenData = -1e38;
		else if(d == 1.0)
			logProbHypothGivenData = 0.0;
		else
			logProbHypothGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d));
		m = std::max(m, logProbHypothGivenData);
		(*it)->m_weight = logProbHypothGivenData;
	}
	for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++)
	{
		double logProbHypothGivenData = (*it)->m_weight;
		(*it)->m_weight = exp(logProbHypothGivenData - m);
	}
}
Example #26
File: main.cpp  Project: litaoshao/waffles
void cumulativeColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double* pPrevRow = pA->row(0);
	for(size_t i = 1; i < pA->rows(); i++)
	{
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += pPrevRow[*it];
		pPrevRow = pRow;
	}
	pA->print(cout);
}
Example #27
void GGaussianProcess::trainInnerInner(const GMatrix& features, const GMatrix& labels)
{
	clear();
	GMatrix* pL;
	{
		// Compute the kernel matrix
		GMatrix k(features.rows(), features.rows());
		for(size_t i = 0; i < features.rows(); i++)
		{
			GVec& row = k[i];
			const GVec& a = features[i];
			for(size_t j = 0; j < features.rows(); j++)
			{
				const GVec& b = features[j];
				row[j] = m_weightsPriorVar * m_pKernel->apply(a, b);
			}
		}

		// Add the noise variance to the diagonal of the kernel matrix
		for(size_t i = 0; i < features.rows(); i++)
			k[i][i] += m_noiseVar;

		// Compute L
		pL = k.cholesky(true);
	}
	std::unique_ptr<GMatrix> hL(pL);

	// Compute the model
	m_pLInv = pL->pseudoInverse();
	GMatrix* pTmp = GMatrix::multiply(*m_pLInv, labels, false, false);
	std::unique_ptr<GMatrix> hTmp(pTmp);
	GMatrix* pLTrans = pL->transpose();
	std::unique_ptr<GMatrix> hLTrans(pLTrans);
	GMatrix* pLTransInv = pLTrans->pseudoInverse();
	std::unique_ptr<GMatrix> hLTransInv(pLTransInv);
	m_pAlpha = GMatrix::multiply(*pLTransInv, *pTmp, false, false);
	GAssert(m_pAlpha->rows() == features.rows());
	GAssert(m_pAlpha->cols() == labels.cols());
	m_pStoredFeatures = new GMatrix();
	m_pStoredFeatures->copy(&features);
}
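The two pseudo-inverse multiplies compute alpha = (L^T)^-1 (L^-1 labels), i.e. the solution of (L L^T) alpha = labels. Since L is triangular, the same result is conventionally obtained by forward and back substitution; a minimal standalone sketch for one right-hand side (hypothetical helper over a row-major std::vector):

// Hypothetical sketch: solve (L L^T) x = b by substitution, L lower-triangular
#include <cstddef>
#include <vector>

std::vector<double> solveCholesky(const std::vector<double>& L,
                                  std::vector<double> b, std::size_t n)
{
	// Forward substitution: L * y = b (y overwrites b)
	for (std::size_t i = 0; i < n; ++i) {
		for (std::size_t j = 0; j < i; ++j)
			b[i] -= L[i * n + j] * b[j];
		b[i] /= L[i * n + i];
	}
	// Back substitution: L^T * x = y (x overwrites b)
	for (std::size_t i = n; i-- > 0; ) {
		for (std::size_t j = i + 1; j < n; ++j)
			b[i] -= L[j * n + i] * b[j];
		b[i] /= L[i * n + i];
	}
	return b;
}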
Example #28
GSparseMatrix* GRecommenderLib::loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix data;
		data.loadArff(szFilename);
		if(data.cols() != 3)
			throw Ex("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		double m0 = data.columnMin(0);
		double r0 = data.columnMax(0) - m0;
		double m1 = data.columnMin(1);
		double r1 = data.columnMax(1) - m1;
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			throw Ex("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			throw Ex("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		std::unique_ptr<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < data.rows(); i++)
		{
			GVec& row = data.row(i);
			pMatrix->set(size_t(row[0]), size_t(row[1]), row[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
Example #29
File: main.cpp  Project: litaoshao/waffles
GSparseMatrix* loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix* pData = GMatrix::loadArff(szFilename);
		if(pData->cols() != 3)
			ThrowError("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		double m0, r0, m1, r1;
		pData->minAndRange(0, &m0, &r0);
		pData->minAndRange(1, &m1, &r1);
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			ThrowError("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			ThrowError("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		Holder<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			pMatrix->set(size_t(pRow[0]), size_t(pRow[1]), pRow[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	ThrowError("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
Example #30
// virtual
void GResamplingAdaBoost::trainInnerInner(const GMatrix& features, const GMatrix& labels)
{
	clear();

	// Initialize all instances with uniform weights
	GVec pDistribution(features.rows());
	pDistribution.fill(1.0 / features.rows());
	size_t drawRows = size_t(m_trainSize * features.rows());
	size_t* pDrawnIndexes = new size_t[drawRows];
	std::unique_ptr<size_t[]> hDrawnIndexes(pDrawnIndexes);

	// Train the ensemble
	size_t labelDims = labels.cols();
	double penalty = 1.0 / labelDims;
	GVec prediction(labelDims);
	for(size_t es = 0; es < m_ensembleSize; es++)
	{
		// Draw a training set from the distribution
		GCategoricalSamplerBatch csb(features.rows(), pDistribution, m_rand);
		csb.draw(drawRows, pDrawnIndexes);
		GMatrix drawnFeatures(features.relation().clone());
		GReleaseDataHolder hDrawnFeatures(&drawnFeatures);
		GMatrix drawnLabels(labels.relation().clone());
		GReleaseDataHolder hDrawnLabels(&drawnLabels);
		size_t* pIndex = pDrawnIndexes;
		for(size_t i = 0; i < drawRows; i++)
		{
			drawnFeatures.takeRow((GVec*)&features[*pIndex]);
			drawnLabels.takeRow((GVec*)&labels[*pIndex]);
			pIndex++;
		}

		// Train an instance of the model and store a clone of it
		m_pLearner->train(drawnFeatures, drawnLabels);
		GDom doc;
		GSupervisedLearner* pClone = m_pLoader->loadLearner(m_pLearner->serialize(&doc));

		// Compute model weight
		double err = 0.5;
		for(size_t i = 0; i < features.rows(); i++)
		{
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
		}
		err /= features.rows();
		if(err >= 0.5)
		{
			delete(pClone);
			break;
		}
		double weight = 0.5 * log((1.0 - err) / err);
		m_models.push_back(new GWeightedModel(weight, pClone));

		// Update the distribution to favor mis-classified instances
		for(size_t i = 0; i < features.rows(); i++)
		{
			err = 0.0;
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
			err /= labelDims;
			pDistribution[i] *= exp(weight * (err * 2.0 - 1.0));
		}
		pDistribution.sumToOne();
	}
	normalizeWeights();
}
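Two standard AdaBoost formulas appear above: the model weight 0.5 * ln((1 - err) / err), and the multiplicative update exp(weight * (err * 2 - 1)) that shifts the sampling distribution toward misclassified instances before renormalizing. A minimal standalone sketch of both (hypothetical helpers):

// Hypothetical sketch of the two AdaBoost formulas used above
#include <cmath>
#include <cstddef>
#include <vector>

double adaBoostModelWeight(double err)
{
	// Valid for 0 < err < 0.5; the code above discards models with err >= 0.5
	return 0.5 * std::log((1.0 - err) / err);
}

void updateDistribution(std::vector<double>& dist,
                        const std::vector<double>& perInstanceErr, double weight)
{
	double sum = 0.0;
	for (std::size_t i = 0; i < dist.size(); ++i) {
		// err in [0,1]: err * 2 - 1 maps correct (0) to -1 and wrong (1) to +1
		dist[i] *= std::exp(weight * (perInstanceErr[i] * 2.0 - 1.0));
		sum += dist[i];
	}
	for (double& d : dist) // renormalize, like sumToOne() above
		d /= sum;
}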