예제 #1
0
// virtual
void GGaussianProcess::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");
	if(features.rows() <= m_maxSamples)
	{
		trainInnerInner(features, labels);
		return;
	}
	GMatrix f(features.relation().clone());
	GReleaseDataHolder hF(&f);
	GMatrix l(labels.relation().clone());
	GReleaseDataHolder hL(&l);
	for(size_t i = 0; i < features.rows(); i++)
	{
		f.takeRow((GVec*)&features[i]);
		l.takeRow((GVec*)&labels[i]);
	}
	while(f.rows() > m_maxSamples)
	{
		size_t i = (size_t)m_rand.next(f.rows());
		f.releaseRow(i);
		l.releaseRow(i);
	}
	trainInnerInner(f, l);
}
예제 #2
0
파일: main.cpp 프로젝트: litaoshao/waffles
void enumerateValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();
	if(pData->relation()->valueCount(col) > 0)
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
	else
	{
		size_t n = 0;
		map<double,size_t> themap;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			map<double,size_t>::iterator it = themap.find(pRow[col]);
			if(it == themap.end())
			{
				themap[pRow[col]] = n;
				pRow[col] = (double)n;
				n++;
			}
			else
				pRow[col] = (double)it->second;
		}
	}
	pData->print(cout);
}
예제 #3
0
파일: GLinear.cpp 프로젝트: b2020b/waffles
// virtual
void GLinearRegressor::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon
	clear();
	GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels);
	Holder<GMatrix> hAll(pAll);
	GPCA pca(features.cols());
	pca.train(*pAll);
	size_t inputs = features.cols();
	size_t outputs = labels.cols();
	GMatrix f(inputs, inputs);
	GMatrix l(inputs, outputs);
	for(size_t i = 0; i < inputs; i++)
	{
		GVec::copy(f[i].data(), pca.basis()->row(i).data(), inputs);
		double sqmag = f[i].squaredMagnitude();
		if(sqmag > 1e-10)
			f[i] *= 1.0 / sqmag;
		l[i].set(pca.basis()->row(i).data() + inputs, outputs);
	}
	m_pBeta = GMatrix::multiply(l, f, true, false);
	m_epsilon.resize(outputs);
	GVecWrapper vw(pca.centroid().data(), m_pBeta->cols());
	m_pBeta->multiply(vw.vec(), m_epsilon, false);
	m_epsilon *= -1.0;
	GVec::add(m_epsilon.data(), pca.centroid().data() + inputs, outputs);

	// Refine the results using gradient descent
	refine(features, labels, 0.06, 20, 0.75);
}
예제 #4
0
파일: main.cpp 프로젝트: litaoshao/waffles
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	size_t classAttr = args.pop_uint();
	
	bool dropClass = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	for(size_t i = 0; i < pData->relation()->valueCount(classAttr); i++)
	{
		GMatrix tmp(pData->relation(), pData->heap());
		pData->splitByNominalValue(&tmp, classAttr, i);
		std::ostringstream oss;
		PathData pd;
		GFile::parsePath(filename, &pd);
		string fn;
		fn.assign(filename + pd.fileStart, pd.extStart - pd.fileStart);
		oss << fn << "_";
		pData->relation()->printAttrValue(oss, classAttr, (double)i);
		oss << ".arff";
		string s = oss.str();
		if(dropClass)
			tmp.deleteColumn(classAttr);
		tmp.saveArff(s.c_str());
	}
}
예제 #5
0
파일: main.cpp 프로젝트: litaoshao/waffles
void fillMissingValues(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool random = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else if(args.if_pop("-random"))
			random = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Replace missing values and print
	GRand prng(nSeed);
	if(random)
	{
		for(size_t i = 0; i < pData->relation()->size(); i++)
			pData->replaceMissingValuesRandomly(i, &prng);
	}
	else
	{
		for(size_t i = 0; i < pData->relation()->size(); i++)
			pData->replaceMissingValuesWithBaseline(i);
	}
	pData->print(cout);
}
예제 #6
0
void test_transform_mergevert()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { alice, bob }\n"
		"@ATTRIBUTE a3 { true, false }\n"
		"@DATA\n"
		"1.2, alice, true\n"
		"2.3, bob, false\n"
		);
	TempFileMaker tempFile2("b.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { charlie, bob }\n"
		"@ATTRIBUTE a3 { false, true }\n"
		"@DATA\n"
		"3.4, bob, true\n"
		"4.5, charlie, false\n"
		);

	// Execute the command
	GPipe pipeStdOut;
	if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	size_t len = pipeStdOut.read(buf, 512);
	if(len == 512)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Check the results
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 4 || M.cols() != 3)
		throw Ex("failed");
	if(M.relation().valueCount(0) != 0)
		throw Ex("failed");
	if(M.relation().valueCount(1) != 3)
		throw Ex("failed");
	if(M.relation().valueCount(2) != 2)
		throw Ex("failed");
	std::ostringstream oss;
	const GArffRelation* pRel = (const GArffRelation*)&M.relation();
	pRel->printAttrValue(oss, 1, 2.0);
	string s = oss.str();
	if(strcmp(s.c_str(), "charlie") != 0)
		throw Ex("failed");
	if(M[0][0] != 1.2 || M[1][0] != 2.3 || M[2][0] != 3.4 || M[3][0] != 4.5)
		throw Ex("failed");
	if(M[0][1] != 0 || M[1][1] != 1 || M[2][1] != 1 || M[3][1] != 2)
		throw Ex("failed");
	if(M[0][2] != 0 || M[1][2] != 1 || M[2][2] != 0 || M[3][2] != 1)
		throw Ex("failed");
}
예제 #7
0
// virtual
void GNaiveBayes::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areNominal())
		throw Ex("GNaiveBayes only supports nominal features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areNominal())
		throw Ex("GNaiveBayes only supports nominal labels. Perhaps you should wrap it in a GAutoFilter.");
	beginIncrementalLearningInner(features.relation(), labels.relation());
	for(size_t n = 0; n < features.rows(); n++)
		trainIncremental(features[n], labels[n]);
}
예제 #8
0
// virtual
void GNaiveInstance::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GNaiveInstance only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GNaiveInstance only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	beginIncrementalLearningInner(features.relation(), labels.relation());
	for(size_t i = 0; i < features.rows(); i++)
		trainIncremental(features[i], labels[i]);
}
예제 #9
0
파일: GLinear.cpp 프로젝트: b2020b/waffles
// virtual
void GLinearDistribution::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Init A with the inverse of the weights prior covariance matrix
	size_t dims = features.cols();
	GMatrix a(dims, dims);
	a.setAll(0.0);

	// Init XY
	size_t labelDims = labels.cols();
	GMatrix xy(dims, labelDims);
	xy.setAll(0.0);

	// Train on each instance
	double w = 1.0 / (m_noiseDev * m_noiseDev);
	for(size_t i = 0; i < features.rows(); i++)
	{
		// Update A
		const GVec& feat = features[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = a[j];
			for(size_t k = 0; k < dims; k++)
				el[k] += feat[j] * feat[k];
		}

		// Update XY
		const GVec& lab = labels[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = xy[j];
			for(size_t k = 0; k < labelDims; k++)
				el[k] += feat[j] * lab[k];
		}
	}
	a.multiply(w);
	xy.multiply(w);

	// Compute final matrices
	clear();
	m_pAInv = a.pseudoInverse();
	GAssert(m_pAInv->cols() == dims);
	GAssert(m_pAInv->rows() == dims);
	m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true);
	GAssert(m_pWBar->cols() == dims);
	GAssert(m_pWBar->rows() == labelDims);
	m_buf.resize(dims);
}
예제 #10
0
	GBagTrainWorker(GMasterThread& master, GBag* pBag, const GMatrix& features, const GMatrix& labels, double trainSize, size_t seed)
	: GWorkerThread(master),
	m_pBag(pBag),
	m_features(features),
	m_labels(labels),
	m_drawnFeatures(features.relation().clone()),
	m_drawnLabels(labels.relation().clone()),
	m_rand(seed)
	{
		GAssert(m_features.rows() > 0);
		m_drawSize = size_t(trainSize * features.rows());
		m_drawnFeatures.reserve(m_drawSize);
		m_drawnLabels.reserve(m_drawSize);
	}
예제 #11
0
파일: main.cpp 프로젝트: litaoshao/waffles
void DropMissingValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();
	for(size_t i = pData->rows() - 1; i < pData->rows(); i--)
	{
		double* pPat = pData->row(i);
		bool drop = false;
		for(size_t j = 0; j < dims; j++)
		{
			if(pRelation->valueCount(j) == 0)
			{
				if(pPat[j] == UNKNOWN_REAL_VALUE)
				{
					drop = true;
					break;
				}
			}
			else
			{
				if(pPat[j] == UNKNOWN_DISCRETE_VALUE)
				{
					drop = true;
					break;
				}
			}
		}
		if(drop)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
예제 #12
0
파일: main.cpp 프로젝트: litaoshao/waffles
void split(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int pats = (int)pData->rows() - args.pop_uint();
	if(pats < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szFilename1 = args.pop_string();
	const char* szFilename2 = args.pop_string();

	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool shouldShuffle = false;
	while(args.size() > 0){
		if(args.if_pop("-shuffle")){
			shouldShuffle = true;
		}else if(args.if_pop("-seed")){
			nSeed = args.pop_uint();
		}else
			ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle if necessary
	GRand rng(nSeed);
	if(shouldShuffle){
		pData->shuffle(rng);
	}

	// Split
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFilename1);
	other.saveArff(szFilename2);
}
예제 #13
0
파일: main.cpp 프로젝트: litaoshao/waffles
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(nFirst < 0 || nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			pPat[i] = (double)std::max((size_t)0, std::min(nBuckets - 1, (size_t)floor(((pPat[i] - min) * nBuckets) / range)));
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
예제 #14
0
void test_parsearff_quoting(){
  const char* inputArff=
    "@relation 'squares of numbers'\n"
    "\n"
    "@attribute 'the number' real\n"
    "\n"
    "@attribute 'the square of the number' real\n"
    "\n"
    "@attribute exact {'is exact', inexact,is\\\\\\ exact}\n"
    "\n"
    "@data\n"
    "1,1,'is exact'\n"
    "2,4,is\\ exact\n"
    "1.414,2,inexact\n"
    "3,9,\"is exact\"\n"
    "4,16,\"is\\ exact\"\n"
    ;

  GMatrix M;
  M.parseArff(inputArff, strlen(inputArff));
  double expected_data[5][3]={{1,1,0},{2,4,0},{1.414,2,1},{3,9,0},{4,16,2}};
  const GArffRelation* pRel = (const GArffRelation*)&M.relation();
  const GArffRelation& R = *pRel;

  TestEqual(R.size(), (std::size_t)3, "Incorrect number of attributes");
  for(unsigned row = 0; row < 5; ++row){
    for(unsigned col = 0; col < 3; ++col){
      std::stringstream errdescr;
      errdescr << "Incorrect matrix entry [" << row << "][" << col << "]";
      TestEqual(M[row][col], expected_data[row][col], errdescr.str());
    }
  }
  TestEqual(true, R.areContinuous(0,2),
	      "First or second attribute is not continuous");
  TestEqual(true, R.areNominal(2,1), "Third attribute is not nominal");

   std::stringstream val0, val1, val2;
   R.printAttrValue(val0, 2, 0);
   R.printAttrValue(val1, 2, 1);
   R.printAttrValue(val2, 2, 2);
   TestEqual("'is exact'",val0.str(),
	       "First value of third attribute incorrect name");
   TestEqual("inexact",val1.str(),
	       "Second value of third attribute incorrect name");
   TestEqual("is\\ exact",val2.str(),
	       "Third value of third attribute incorrect name");


  TestEqual("'the number'",R.attrName(0),"First attribute incorrect name");
  TestEqual("'the square of the number'",R.attrName(1),
	      "Second attribute incorrect name");
  TestEqual("exact",R.attrName(2),"Third attribute incorrect name");

}
예제 #15
0
파일: main.cpp 프로젝트: litaoshao/waffles
void splitFold(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t fold = args.pop_uint();
	size_t folds = args.pop_uint();
	if(fold >= folds)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Options
	string filenameTrain = "train.arff";
	string filenameTest = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			filenameTrain = args.pop_string();
			filenameTest = args.pop_string();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Copy relevant portions of the data
	GMatrix train(pData->relation());
	GMatrix test(pData->relation());
	size_t begin = pData->rows() * fold / folds;
	size_t end = pData->rows() * (fold + 1) / folds;
	for(size_t i = 0; i < begin; i++)
		train.copyRow(pData->row(i));
	for(size_t i = begin; i < end; i++)
		test.copyRow(pData->row(i));
	for(size_t i = end; i < pData->rows(); i++)
		train.copyRow(pData->row(i));
	train.saveArff(filenameTrain.c_str());
	test.saveArff(filenameTest.c_str());
}
예제 #16
0
파일: main.cpp 프로젝트: litaoshao/waffles
void SwapAttributes(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nAttr1 = args.pop_uint();
	size_t nAttr2 = args.pop_uint();
	size_t attrCount = pData->relation()->size();
	if(nAttr1 >= attrCount)
		ThrowError("Index out of range");
	if(nAttr2 >= attrCount)
		ThrowError("Index out of range");
	pData->swapColumns(nAttr1, nAttr2);
	pData->print(cout);
}
예제 #17
0
파일: main.cpp 프로젝트: kslazarev/waffles
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	size_t targetDims = args.pop_uint();

	// Parse Options
	const char* szPreprocessedData = NULL;
	double scaleRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-continue"))
			szPreprocessedData = args.pop_string();
		else if(args.if_pop("-scalerate"))
			scaleRate = args.pop_double();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data
	GMatrix* pDataHint = NULL;
	Holder<GMatrix> hDataHint(NULL);
	if(szPreprocessedData)
	{
		pDataHint = loadData(szPreprocessedData);
		hDataHint.reset(pDataHint);
		if(pDataHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pDataHint->rows() != pData->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data
	GManifoldSculpting transform(pNF->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(scaleRate);
	if(pDataHint)
		transform.setPreprocessedData(hDataHint.release());
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
예제 #18
0
// virtual
void GBag::trainInnerInner(GMatrix& features, GMatrix& labels)
{
	// Train all the models
	size_t nLearnerCount = m_models.size();
	size_t nDrawSize = size_t(m_trainSize * features.rows());
	GMatrix drawnFeatures(features.relation());
	GMatrix drawnLabels(labels.relation());
	drawnFeatures.reserve(nDrawSize);
	drawnLabels.reserve(nDrawSize);
	{
		for(size_t i = 0; i < nLearnerCount; i++)
		{
			if(m_pCB)
				m_pCB(m_pThis, i, nLearnerCount);

			// Randomly draw some data (with replacement)
			GReleaseDataHolder hDrawnFeatures(&drawnFeatures);
			GReleaseDataHolder hDrawnLabels(&drawnLabels);
			for(size_t j = 0; j < nDrawSize; j++)
			{
				size_t r = (size_t)m_rand.next(features.rows());
				drawnFeatures.takeRow(features[r]);
				drawnLabels.takeRow(labels[r]);
			}

			// Train the learner with the drawn data
			m_models[i]->m_pModel->train(drawnFeatures, drawnLabels);
		}
		if(m_pCB)
			m_pCB(m_pThis, nLearnerCount, nLearnerCount);
	}

	// Determine the weights
	determineWeights(features, labels);
	normalizeWeights();
}
예제 #19
0
파일: main.cpp 프로젝트: litaoshao/waffles
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
예제 #20
0
파일: main.cpp 프로젝트: litaoshao/waffles
void rotate(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	sp_relation relation = pA->relation();
	unsigned colx = args.pop_uint();
	if(colx >= pA->cols()){
	  ThrowError("Rotation first column index (",to_str(colx),") "
		     "should not be greater "
		     "than the largest index, which is ", to_str(pA->cols()-1),
		     ".");
	}
	if(!relation->areContinuous(colx,1)){
	  ThrowError("Rotation first column index (",to_str(colx),") "
		     "should be continuous and it is not.");
		     
	}
	unsigned coly = args.pop_uint();
	if(coly >= pA->cols()){
	  ThrowError("Rotation second column index (",to_str(coly),") "
		     "should not be greater "
		     "than the largest index, which is ", to_str(pA->cols()-1),
		     ".");
	}
	if(!relation->areContinuous(coly,1)){
	  ThrowError("Rotation second column index (",to_str(coly),") "
		     "should be continuous and it is not.");
	}
	
	double angle = args.pop_double();

	angle = angle * M_PI / 180; //Convert from degrees to radians
	double cosAngle = std::cos(angle);
	double sinAngle = std::sin(angle);
	for(std::size_t rowIdx = 0; rowIdx < pA->rows(); ++rowIdx){
	  double* row = (*pA)[rowIdx];
	  double x = row[colx];
	  double y = row[coly];
	  row[colx]=x*cosAngle-y*sinAngle;
	  row[coly]=x*sinAngle+y*cosAngle;
	}
	pA->print(cout);
}
예제 #21
0
파일: main.cpp 프로젝트: kslazarev/waffles
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid neighbor finder option: ", args.peek());
	}

	// Do the attribute selection
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	if(outFilename.length() > 0)
	{
		as.train(data);
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
		pDataOut->saveArff(outFilename.c_str());
	}
	else
		as.train(data);
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
	  cout << originalIndices.at(as.ranks()[i]) << " " << pRel->attrName(as.ranks()[i]) << "\n";
}
예제 #22
0
// virtual
void GEnsemble::trainInner(const GMatrix& features, const GMatrix& labels)
{
	delete(m_pLabelRel);
	m_pLabelRel = labels.relation().clone();

	// Make the accumulator buffer
	size_t labelDims = m_pLabelRel->size();
	size_t nAccumulatorDims = 0;
	for(size_t i = 0; i < labelDims; i++)
	{
		size_t nValues = m_pLabelRel->valueCount(i);
		if(nValues > 0)
			nAccumulatorDims += nValues;
		else
			nAccumulatorDims += 2; // mean and variance
	}
	m_accumulator.resize(nAccumulatorDims);

	trainInnerInner(features, labels);
}
예제 #23
0
// virtual
void GEnsemble::trainInner(GMatrix& features, GMatrix& labels)
{
	m_pLabelRel = labels.relation();

	// Make the accumulator buffer
	size_t labelDims = m_pLabelRel->size();
	m_nAccumulatorDims = 0;
	for(size_t i = 0; i < labelDims; i++)
	{
		size_t nValues = m_pLabelRel->valueCount(i);
		if(nValues > 0)
			m_nAccumulatorDims += nValues;
		else
			m_nAccumulatorDims += 2; // mean and variance
	}
	delete[] m_pAccumulator;
	m_pAccumulator = new double[m_nAccumulatorDims];

	trainInnerInner(features, labels);
}
예제 #24
0
// virtual
void GNaiveBayes::trainSparse(GSparseMatrix& features, GMatrix& labels)
{
	if(features.rows() != labels.rows())
		throw Ex("Expected the features and labels to have the same number of rows");
	size_t featureDims = features.cols();
	GUniformRelation featureRel(featureDims, 2);
	beginIncrementalLearning(featureRel, labels.relation());
	GVec fullRow(featureDims);
	for(size_t n = 0; n < features.rows(); n++)
	{
		features.fullRow(fullRow, n);
		for(size_t i = 0; i < featureDims; i++)
		{
			if(fullRow[i] < 1e-6)
				fullRow[i] = 0.0;
			else
				fullRow[i] = 1.0;
		}
		trainIncremental(fullRow, labels[n]);
	}
}
예제 #25
0
파일: main.cpp 프로젝트: litaoshao/waffles
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	pTransition->newRows(pActions->rows() - 1);
	for(size_t i = 0; i < pActions->rows() - 1; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
예제 #26
0
파일: main.cpp 프로젝트: litaoshao/waffles
void Import(GArgReader& args)
{
	// Load the file
	size_t len;
	const char* filename = args.pop_string();
	char* pFile = GFile::loadFile(filename, &len);
	ArrayHolder<char> hFile(pFile);

	// Parse Options
	char separator = ',';
	bool tolerant = false;
	bool columnNamesInFirstRow = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = '\t';
		else if(args.if_pop("-space"))
			separator = ' ';
		else if(args.if_pop("-whitespace"))
			separator = '\0';
		else if(args.if_pop("-semicolon"))
			separator = ';';
		else if(args.if_pop("-separator"))
			separator = args.pop_string()[0];
		else if(args.if_pop("-tolerant"))
			tolerant = true;
		else if(args.if_pop("-columnnames"))
			columnNamesInFirstRow = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Parse the file
	GMatrix* pData = GMatrix::parseCsv(pFile, len, separator, columnNamesInFirstRow, tolerant);
	Holder<GMatrix> hData(pData);
	((GArffRelation*)pData->relation().get())->setName(filename);

	// Print the data
	pData->print(cout);
}
예제 #27
0
파일: main.cpp 프로젝트: litaoshao/waffles
void Export(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	const char* separator = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = "	";
		else if(args.if_pop("-space"))
			separator = " ";
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print
	for(size_t i = 0; i < pData->rows(); i++)
		pData->relation()->printRow(cout, pData->row(i), separator);
}
예제 #28
0
파일: main.cpp 프로젝트: litaoshao/waffles
void aggregateRows(GArgReader& args)
{
	size_t r = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue;
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			pResults = new GMatrix(pData->relation());
			hResults.reset(pResults);
		}
		pResults->takeRow(pData->releaseRow(r));
	}
	pResults->print(cout);
}
예제 #29
0
파일: main.cpp 프로젝트: litaoshao/waffles
void SortByAttribute(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nAttr = args.pop_uint();
	size_t attrCount = pData->relation()->size();
	if(nAttr >= attrCount)
		ThrowError("Index out of range");

	// Parse options
	bool descending = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-descending"))
			descending = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	pData->sort(nAttr);
	if(descending)
		pData->reverseRows();
	pData->print(cout);
}
예제 #30
0
// virtual
void GResamplingAdaBoost::trainInnerInner(const GMatrix& features, const GMatrix& labels)
{
	clear();

	// Initialize all instances with uniform weights
	GVec pDistribution(features.rows());
	pDistribution.fill(1.0 / features.rows());
	size_t drawRows = size_t(m_trainSize * features.rows());
	size_t* pDrawnIndexes = new size_t[drawRows];
	std::unique_ptr<size_t[]> hDrawnIndexes(pDrawnIndexes);

	// Train the ensemble
	size_t labelDims = labels.cols();
	double penalty = 1.0 / labelDims;
	GVec prediction(labelDims);
	for(size_t es = 0; es < m_ensembleSize; es++)
	{
		// Draw a training set from the distribution
		GCategoricalSamplerBatch csb(features.rows(), pDistribution, m_rand);
		csb.draw(drawRows, pDrawnIndexes);
		GMatrix drawnFeatures(features.relation().clone());
		GReleaseDataHolder hDrawnFeatures(&drawnFeatures);
		GMatrix drawnLabels(labels.relation().clone());
		GReleaseDataHolder hDrawnLabels(&drawnLabels);
		size_t* pIndex = pDrawnIndexes;
		for(size_t i = 0; i < drawRows; i++)
		{
			drawnFeatures.takeRow((GVec*)&features[*pIndex]);
			drawnLabels.takeRow((GVec*)&labels[*pIndex]);
			pIndex++;
		}

		// Train an instance of the model and store a clone of it
		m_pLearner->train(drawnFeatures, drawnLabels);
		GDom doc;
		GSupervisedLearner* pClone = m_pLoader->loadLearner(m_pLearner->serialize(&doc));

		// Compute model weight
		double err = 0.5;
		for(size_t i = 0; i < features.rows(); i++)
		{
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
		}
		err /= features.rows();
		if(err >= 0.5)
		{
			delete(pClone);
			break;
		}
		double weight = 0.5 * log((1.0 - err) / err);
		m_models.push_back(new GWeightedModel(weight, pClone));

		// Update the distribution to favor mis-classified instances
		for(size_t i = 0; i < features.rows(); i++)
		{
			err = 0.0;
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
			err /= labelDims;
			pDistribution[i] *= exp(weight * (err * 2.0 - 1.0));
		}
		pDistribution.sumToOne();
	}
	normalizeWeights();
}