Example #1
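// Computes the autocorrelation (mean-centered, unnormalized) of each column for lags 1 through min(256, rows/2) and prints the result with the lag in column 0.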
void autoCorrelation(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t lag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);
	GMatrix ac(0, dims + 1);
	for(size_t i = 1; i <= lag; i++)
	{
		double* pRow = ac.newRow();
		*(pRow++) = (double)i;
		for(size_t j = 0; j < dims; j++)
		{
			*pRow = 0;
			size_t k;
			for(k = 0; k + i < pData->rows(); k++)
			{
				double* pA = pData->row(k);
				double* pB = pData->row(k + i);
				*pRow += (pA[j] - mean[j]) * (pB[j] - mean[j]);
			}
			*pRow /= k;
			pRow++;
		}
	}
	ac.print(cout);
}
Example #2
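	// Applies the transform matrix read from pVector (starting at offset m_attrs) to every row of m_pData2, adds the first m_attrs elements of pVector as an offset, and stores the results in m_transformed.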
	void TransformData(const double* pVector)
	{
		m_transform.fromVector(pVector + m_attrs, m_attrs);
		for(size_t i = 0; i < m_pData2->rows(); i++)
		{
			double* pPatIn = m_pData2->row(i);
			double* pPatOut = m_transformed.row(i);
			m_transform.multiply(pPatIn, pPatOut);
			GVec::add(pPatOut, pVector, m_attrs);
		}
	}
Example #3
///TODO: this command should be documented
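// Subtracts row r from every other row so that row r becomes the origin, zeros row r, and prints the result.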
void center(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int r = args.pop_uint();
	size_t cols = pData->cols();
	double* pRow = pData->row(r);
	for(size_t i = 0; i < r; ++i)
		GVec::subtract(pData->row(i), pRow, cols);
	for(size_t i = r + 1; i < pData->rows(); ++i)
		GVec::subtract(pData->row(i), pRow, cols);
	GVec::setAll(pRow, 0.0, cols);
	pData->print(cout);
}
Example #4
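// Drops the specified portion of values at random, replacing them with the unknown-value marker (real or nominal as appropriate); selection sampling guarantees exactly k of the n values are dropped. Prints the result.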
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
Example #5
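// Replaces the values in the specified columns with their running (cumulative) sums and prints the result.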
void cumulativeColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double* pPrevRow = pA->row(0);
	for(size_t i = 1; i < pA->rows(); i++)
	{
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += pPrevRow[*it];
		pPrevRow = pRow;
	}
	pA->print(cout);
}
Example #6
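// Adds zero-mean Gaussian noise with the given deviation to every value, optionally skipping the last -excludelast columns, and prints the result.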
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
Example #7
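// Re-enumerates the values in the specified column: a nominal column is simply re-typed as continuous, while a continuous column has each distinct value mapped to 0, 1, 2, ... in order of first appearance. Prints the result.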
void enumerateValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();
	if(pData->relation()->valueCount(col) > 0)
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
	else
	{
		size_t n = 0;
		map<double,size_t> themap;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			map<double,size_t>::iterator it = themap.find(pRow[col]);
			if(it == themap.end())
			{
				themap[pRow[col]] = n;
				pRow[col] = (double)n;
				n++;
			}
			else
				pRow[col] = (double)it->second;
		}
	}
	pData->print(cout);
}
Example #8
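// Deletes every row that contains a missing (unknown) value in any column and prints the remaining rows.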
void DropMissingValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();
	for(size_t i = pData->rows() - 1; i < pData->rows(); i--) // iterate over the rows backward; unsigned wrap-around past zero ends the loop
	{
		double* pPat = pData->row(i);
		bool drop = false;
		for(size_t j = 0; j < dims; j++)
		{
			if(pRelation->valueCount(j) == 0)
			{
				if(pPat[j] == UNKNOWN_REAL_VALUE)
				{
					drop = true;
					break;
				}
			}
			else
			{
				if(pPat[j] == UNKNOWN_DISCRETE_VALUE)
				{
					drop = true;
					break;
				}
			}
		}
		if(drop)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
Example #9
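// Copy constructor: allocates a new buffer and deep-copies the elements of m.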
GMatrix::GMatrix(GMatrix& m)
{
    m_row = m.row();
    m_col = m.col();
    int size = m_row*m_col;
    m_data = new double[size];
    for(int i = 0; i < size; ++i)
    {
        m_data[i] = m[i];
    }
}
Example #10
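// Builds a transition dataset in which each row holds action(t), state(t), and state(t+1) (or the state change when -delta is given), and prints it.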
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	pTransition->newRows(pActions->rows() - 1);
	for(size_t i = 0; i < pActions->rows() - 1; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
Example #11
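// Splits the data into training and test sets for the given cross-validation fold and saves them as ARFF files (train.arff and test.arff by default, or the names given with -out).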
void splitFold(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t fold = args.pop_uint();
	size_t folds = args.pop_uint();
	if(fold >= folds)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Options
	string filenameTrain = "train.arff";
	string filenameTest = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			filenameTrain = args.pop_string();
			filenameTest = args.pop_string();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Copy relevant portions of the data
	GMatrix train(pData->relation());
	GMatrix test(pData->relation());
	size_t begin = pData->rows() * fold / folds;
	size_t end = pData->rows() * (fold + 1) / folds;
	for(size_t i = 0; i < begin; i++)
		train.copyRow(pData->row(i));
	for(size_t i = begin; i < end; i++)
		test.copyRow(pData->row(i));
	for(size_t i = end; i < pData->rows(); i++)
		train.copyRow(pData->row(i));
	train.saveArff(filenameTrain.c_str());
	test.saveArff(filenameTest.c_str());
}
Example #12
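// Adds a constant offset to every value in the specified columns and prints the result.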
void shiftColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double offset = args.pop_double();
	for(size_t i = 0; i < pA->rows(); i++)
	{
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += offset;
	}
	pA->print(cout);
}
Example #13
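// Discretizes the continuous attributes in the specified column range into equal-width buckets and prints the result.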
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(nLast >= pData->relation()->size() || nLast < nFirst) // nFirst is unsigned, so no lower-bound check is needed
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			pPat[i] = (double)std::max((size_t)0, std::min(nBuckets - 1, (size_t)floor(((pPat[i] - min) * nBuckets) / range)));
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
Example #14
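// Prints the data as delimited text, using a comma by default, or a tab or space when -tab or -space is given.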
void Export(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	const char* separator = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = "\t";
		else if(args.if_pop("-space"))
			separator = " ";
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print
	for(size_t i = 0; i < pData->rows(); i++)
		pData->relation()->printRow(cout, pData->row(i), separator);
}
Example #15
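// Loads a sparse matrix either from a 3-column ARFF file (row index, column index, value) or from a .sparse JSON file.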
GSparseMatrix* GRecommenderLib::loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix data;
		data.loadArff(szFilename);
		if(data.cols() != 3)
			throw Ex("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		double m0 = data.columnMin(0);
		double r0 = data.columnMax(0) - m0;
		double m1 = data.columnMin(1);
		double r1 = data.columnMax(1) - m1;
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			throw Ex("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			throw Ex("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		std::unique_ptr<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < data.rows(); i++)
		{
			GVec& row = data.row(i);
			pMatrix->set(size_t(row[0]), size_t(row[1]), row[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
Example #16
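// Loads a sparse matrix either from a 3-column ARFF file (row index, column index, value) or from a .sparse JSON file; an alternate version of the loader in Example #15.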
GSparseMatrix* loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix* pData = GMatrix::loadArff(szFilename);
		if(pData->cols() != 3)
			ThrowError("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		double m0, r0, m1, r1;
		pData->minAndRange(0, &m0, &r0);
		pData->minAndRange(1, &m1, &r1);
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			ThrowError("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			ThrowError("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		Holder<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			pMatrix->set(size_t(pRow[0]), size_t(pRow[1]), pRow[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	ThrowError("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
Example #17
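// Fills in missing values: normalizes the data and converts nominals to reals, trains a collaborative filter on the known entries (in row/column/value form), predicts the unknown entries, converts back to the original representation, and prints the result.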
void fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	sp_relation pOrigRel = pDataOrig->relation();
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	hDataOrig.release();
	pDataOrig = NULL;

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = i;
				pVec[1] = j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = filter.untransformBatch(*pData);
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
Example #18
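// Trains a GNeuralDecomposition model on a time series (optionally with explicitly supplied feature values) and writes the serialized model to stdout as JSON.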
void Train(GArgReader &args)
{
	// Load series from file
	std::unique_ptr<GMatrix> hSeries, hFeatures;
	LoadData(args, hSeries);
	GMatrix *pSeries = hSeries.get();
	
	// Split features/labels
	if(pSeries->cols() == 2)
	{
		GMatrix *pFeatures = pSeries->cloneSub(0, 0, pSeries->rows(), 1);
		GMatrix *pLabels = pSeries->cloneSub(0, 1, pSeries->rows(), 1);
		hFeatures.reset(pFeatures);
		hSeries.reset(pLabels);
		pSeries = pLabels;
	}
	else if(pSeries->cols() > 2)
	{
		throw Ex("Too many columns!");
	}
	
	// Parse options
	GNeuralDecomposition *nd = new GNeuralDecomposition();
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularization"))
			nd->setRegularization(args.pop_double());
		else if(args.if_pop("-learningRate"))
			nd->setLearningRate(args.pop_double());
		else if(args.if_pop("-linearUnits"))
			nd->setLinearUnits(args.pop_uint());
		else if(args.if_pop("-softplusUnits"))
			nd->setSoftplusUnits(args.pop_uint());
		else if(args.if_pop("-sigmoidUnits"))
			nd->setSigmoidUnits(args.pop_uint());
		else if(args.if_pop("-epochs"))
			nd->setEpochs(args.pop_uint());
		else if(args.if_pop("-features"))
			LoadData(args, hFeatures);
		else if(args.if_pop("-filterLogarithm"))
			nd->setFilterLogarithm(true);
		else
			throw Ex("Invalid option: ", args.peek());
	}
	
	if(hFeatures.get() == NULL)
	{
		// Generate features
		GMatrix *pFeatures = new GMatrix(pSeries->rows(), 1);
		for(size_t i = 0; i < pSeries->rows(); i++)
		{
			pFeatures->row(i)[0] = i / (double) pSeries->rows();
		}
		hFeatures.reset(pFeatures);
	}
	
	// Train
	GMatrix *pFeatures = hFeatures.get();
	nd->train(*pFeatures, *pSeries);
	
	// Output the trained model
	GDom doc;
	doc.setRoot(nd->serialize(&doc));
	doc.writeJson(cout);
}
Example #19
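// Labels the rows of features2 by building a k-nearest-neighbor graph over all rows (using distance scale factors from a trained GKNN model) and running a max-flow/min-cut graph cut for each label value.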
// virtual
GMatrix* GGraphCutTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2)
{
	// Use k-NN to compute a distance metric with good scale factors for prediction
	GKNN knn;
	knn.setNeighborCount(m_neighborCount);
	//knn.setOptimizeScaleFactors(true);
	knn.train(features1, labels1);
	GRowDistanceScaled* pMetric = knn.metric();

	// Merge the features into one dataset and build a kd-tree
	GMatrix both(features1.relation().clone());
	GReleaseDataHolder hBoth(&both);
	both.reserve(features1.rows() + features2.rows());
	for(size_t i = 0; i < features1.rows(); i++)
		both.takeRow((double*)features1[i]);
	for(size_t i = 0; i < features2.rows(); i++)
		both.takeRow((double*)features2[i]);
	GRowDistanceScaled metric2;
	GKdTree neighborFinder(&both, m_neighborCount, &metric2, false);
	GVec::copy(metric2.scaleFactors(), pMetric->scaleFactors(), features1.cols());

	// Transduce
	GMatrix* pOut = new GMatrix(labels1.relation().clone());
	Holder<GMatrix> hOut(pOut);
	pOut->newRows(features2.rows());
	pOut->setAll(0);
	for(size_t lab = 0; lab < labels1.cols(); lab++)
	{
		// Use max-flow/min-cut graph-cut to separate out each label value
		int valueCount = (int)labels1.relation().valueCount(lab);
		for(int val = 1; val < valueCount; val++)
		{
			// Add neighborhood edges
			GGraphCut gc(features1.rows() + features2.rows() + 2);
			for(size_t i = 0; i < both.rows(); i++)
			{
				neighborFinder.neighbors(m_pNeighbors, m_pDistances, i);
				for(size_t j = 0; j < m_neighborCount; j++)
				{
					if(m_pNeighbors[j] >= both.rows())
						continue;
					gc.addEdge(2 + i, 2 + m_pNeighbors[j], (float)(1.0 / std::max(sqrt(m_pDistances[j]), 1e-9))); // connect neighbors
				}
			}

			// Add source and sink edges
			for(size_t i = 0; i < features1.rows(); i++)
			{
				if((int)labels1[i][0] == val)
					gc.addEdge(0, 2 + i, 1e12f); // connect to source
				else
					gc.addEdge(1, 2 + i, 1e12f); // connect to sink
			}

			// Cut
			gc.cut(0, 1);

			// Label the unlabeled rows
			for(size_t i = 0; i < features2.rows(); i++)
			{
				if(gc.isSource(2 + features1.rows() + i))
					pOut->row(i)[lab] = (double)val;
			}
		}
	}
	return hOut.release();
}
Example #20
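// Fills in missing values with a collaborative filter: optionally normalizes, converts the known entries to row/column/value form, trains the filter, predicts the unknown entries, and prints the data in its original representation.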
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse params
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--) // iterate backward over the ignored columns; unsigned wrap-around past zero ends the loop
		dataOrig.deleteColumns(ignore[i], 1);

	GRelation* pOrigRel = dataOrig.relation().clone();
	std::unique_ptr<GRelation> hOrigRel(pOrigRel);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		GIncrementalTransformChainer* pChainer = new GIncrementalTransformChainer(new GNormalize(), pNtc);
		hChainer.reset(pChainer);
		pFilter = pChainer;
	}
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	GMatrix* pData = pFilter->transformBatch(dataOrig);
	std::unique_ptr<GMatrix> hData(pData);

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	std::unique_ptr<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = pMatrix->newRow();
				vec[0] = (double)i;
				vec[1] = (double)j;
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = pFilter->untransformBatch(*pData);
	pOut->setRelation(hOrigRel.release());
	pOut->print(cout);
}
Example #21
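// Projects the data onto its first nTargetDims principal components, either training a new GPCA model or loading one with -modelin, optionally saving the eigenvalues, components, a round-trip reconstruction, and the model, then prints the transformed data.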
void principalComponentAnalysis(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	string eigenvalues;
	string components;
	string modelIn;
	string modelOut;
	bool aboutOrigin = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-roundtrip"))
			roundTrip = args.pop_string();
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else if(args.if_pop("-components"))
			components = args.pop_string();
		else if(args.if_pop("-aboutorigin"))
			aboutOrigin = true;
		else if(args.if_pop("-modelin"))
			modelIn = args.pop_string();
		else if(args.if_pop("-modelout"))
			modelOut = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GRand prng(seed);
	GPCA* pTransform = NULL;
	if(modelIn.length() > 0)
	{
		GDom doc;
		doc.loadJson(modelIn.c_str());
		GLearnerLoader ll(prng);
		pTransform = new GPCA(doc.root(), ll);
	}
	else
	{
		pTransform = new GPCA(nTargetDims, &prng);
		if(aboutOrigin)
			pTransform->aboutOrigin();
		if(eigenvalues.length() > 0)
			pTransform->computeEigVals();
		pTransform->train(*pData);
	}
	Holder<GPCA> hTransform(pTransform);

	GMatrix* pDataAfter = pTransform->transformBatch(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = pTransform->eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// Save the components
	if(components.length() > 0)
		pTransform->components()->saveArff(components.c_str());

	// Do the round-trip
	if(roundTrip.size() > 0)
	{
		GMatrix roundTripped(pData->rows(), pData->cols());
		for(size_t i = 0; i < pData->rows(); i++)
			pTransform->untransform(pDataAfter->row(i), roundTripped.row(i));
		roundTripped.saveArff(roundTrip.c_str());
	}

	if(modelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pTransform->serialize(&doc));
		doc.saveJson(modelOut.c_str());
	}

	pDataAfter->print(cout);
}
Example #22
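// Prints basic statistics for two columns and tests whether their values differ significantly using a paired t-test (raw and normalized) and the Wilcoxon signed-ranks test.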
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Wilcoxon Signed Ranks Test\n";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";

		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less than 0.05 is considered to be significant.\n";
/*
			double p_sum = GMath::wilcoxonPValue(num, w_sum);
			cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
*/
	}
}
Example #23
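// Labels the rows of features2 by agglomerative clustering: builds a neighbor graph over all rows, then merges clusters in order of increasing neighbor distance, propagating labels from labeled rows to unlabeled ones.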
// virtual
GMatrix* GAgglomerativeTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2)
{
	// Init the metric
	if(!m_pMetric)
		setMetric(new GRowDistance(), true);
	m_pMetric->init(&features1.relation(), false);

	// Make a dataset with all features
	GMatrix featuresAll(features1.relation().clone());
	featuresAll.reserve(features1.rows() + features2.rows());
	GReleaseDataHolder hFeaturesAll(&featuresAll);
	for(size_t i = 0; i < features1.rows(); i++)
		featuresAll.takeRow((double*)features1[i]);
	for(size_t i = 0; i < features2.rows(); i++)
		featuresAll.takeRow((double*)features2[i]);

	// Find enough neighbors to form a connected graph
	GNeighborGraph* pNF = NULL;
	size_t neighbors = 6;
	while(true)
	{
		GKdTree* pKdTree = new GKdTree(&featuresAll, neighbors, m_pMetric, false);
		pNF = new GNeighborGraph(pKdTree, true);
		pNF->fillCache();
		if(pNF->isConnected())
			break;
		if(neighbors + 1 >= featuresAll.rows())
		{
			delete(pNF);
			throw Ex("internal problem--a graph with so many neighbors must be connected");
		}
		neighbors = std::min((neighbors * 3) / 2, featuresAll.rows() - 1);
	}

	// Sort all the neighbors by their distances
	size_t count = featuresAll.rows() * neighbors;
	vector< std::pair<double,size_t> > distNeighs;
	distNeighs.resize(count);
	double* pDistances = pNF->squaredDistanceTable();
	size_t* pRows = pNF->cache();
	vector< std::pair<double,size_t> >::iterator it = distNeighs.begin();
	for(size_t i = 0; i < count; i++)
	{
		if(*pRows < featuresAll.rows())
		{
			it->first = *pDistances;
			it->second = i;
			it++;
		}
		pRows++;
		pDistances++;
	}
	std::sort(distNeighs.begin(), it);

	// Transduce
	GMatrix* pOut = new GMatrix(labels1.relation().clone());
	Holder<GMatrix> hOut(pOut);
	pOut->newRows(features2.rows());
	pOut->setAll(-1);
	size_t* pSiblings = new size_t[featuresAll.rows()]; // a cyclical linked list of each row in the cluster
	ArrayHolder<size_t> hSiblings(pSiblings);
	for(size_t lab = 0; lab < labels1.cols(); lab++)
	{
		// Assign each row to its own cluster
		GIndexVec::makeIndexVec(pSiblings, featuresAll.rows()); // init such that each row is in a cluster of 1
		size_t missingLabels = features2.rows();

		// Merge until we have the desired number of clusters
		pRows = pNF->cache();
		for(vector< std::pair<double,size_t> >::iterator dn = distNeighs.begin(); dn != it; dn++)
		{
			// Get the next two closest points
			size_t a = dn->second / neighbors;
			size_t b = pRows[dn->second];
			GAssert(a != b && a < featuresAll.rows() && b < featuresAll.rows());
			int labelA = (a < features1.rows() ? (int)labels1[a][lab] : (int)pOut->row(a - features1.rows())[lab]);
			int labelB = (b < features1.rows() ? (int)labels1[b][lab] : (int)pOut->row(b - features1.rows())[lab]);

			// Merge the clusters
			if(labelA >= 0 && labelB >= 0)
				continue; // Both points are already labeled, so there is no point in merging their clusters
			if(labelA < 0 && labelB >= 0) // Make sure that if one of them has a valid label, it is point a
			{
				std::swap(a, b);
				std::swap(labelA, labelB);
			}
			if(labelA >= 0)
			{
				for(size_t i = pSiblings[b]; true; i = pSiblings[i]) // Label every row in cluster b
				{
					GAssert(i >= features1.rows());
					GAssert(pOut->row(i - features1.rows())[lab] == (double)-1);
					pOut->row(i - features1.rows())[lab] = labelA;
					missingLabels--;
					if(i == b)
						break;
				}
				if(missingLabels <= 0)
					break;
			}
			std::swap(pSiblings[a], pSiblings[b]); // This line joins the cyclical linked lists into one big cycle
		}
	}
	return hOut.release();
}
Example #24
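// Reduces the data to nTargetDims dimensions with GNeuroPCA, optionally using a linear activation and/or a clamped bias, optionally saving the eigenvalues, and prints the transformed data.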
void neuroPCA(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool trainBias = true;
	bool linear = false;
	string eigenvalues = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-clampbias"))
			trainBias = false;
		else if(args.if_pop("-linear"))
			linear = true;
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GRand prng(seed);
	GNeuroPCA transform(nTargetDims, &prng);
	if(!trainBias)
		transform.clampBias();
	if(linear)
		transform.setActivation(new GActivationIdentity());
	if(eigenvalues.length() > 0)
		transform.computeEigVals();
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = transform.eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// In linear mode, people usually expect normalized eigenvectors, so let's normalize them now
	if(linear)
	{
		GMatrix* pWeights = transform.weights();
		GAssert(pWeights->cols() == pData->cols());
		for(int i = 0; i < nTargetDims; i++)
		{
			double scal = sqrt(GVec::squaredMagnitude(pWeights->row(i + 1), pWeights->cols()));
			for(size_t j = 0; j < pDataAfter->rows(); j++)
				pDataAfter->row(j)[i] *= scal;
		}
	}

	pDataAfter->print(cout);
}