Ejemplo n.º 1
0
GMatrixFactorization* GRecommenderLib::InstantiateMatrixFactorization(GArgReader& args)
{
	if(args.size() < 1)
		throw Ex("The number of intrinsic dims must be specified for this algorithm");
	size_t intrinsicDims = args.pop_uint();
	GMatrixFactorization* pModel = new GMatrixFactorization(intrinsicDims);
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularize"))
			pModel->setRegularizer(args.pop_double());
		else if(args.if_pop("-miniters"))
			pModel->setMinIters(args.pop_uint());
		else if(args.if_pop("-decayrate"))
			pModel->setDecayRate(args.pop_double());
		else if(args.if_pop("-nonneg"))
			pModel->nonNegative();
		else if(args.if_pop("-clampusers"))
		{
			GMatrix tmp;
			tmp.loadArff(args.pop_string());
			size_t offset = args.pop_uint();
			pModel->clampUsers(tmp, offset);
		}
		else if(args.if_pop("-clampitems"))
		{
			GMatrix tmp;
			tmp.loadArff(args.pop_string());
			size_t offset = args.pop_uint();
			pModel->clampItems(tmp, offset);
		}
		else
			throw Ex("Invalid option: ", args.peek());
	}
	return pModel;
}
Ejemplo n.º 2
0
void Loader::loadLaborData2(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/labor_stats.arff");
	
	size_t dims = 1;
	size_t offset = 0;
	size_t train_size = 258;
	size_t test_size = 96;
	
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		*x = double(i) / train_size;
		*y = raw[offset + i][0];
	}
}
Ejemplo n.º 3
0
void loadData(GMatrix& m, const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		m.loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
	{
		GCSVParser parser;
		parser.parse(m, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < m.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(m, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < m.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
}
Ejemplo n.º 4
0
void Loader::loadMackeyGlassData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/mackey_glass.arff");
	
	size_t dims = 1;
	size_t offset = 30;
	size_t train_size = 300;
	size_t test_size = 600;
	
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		*x = double(i) / train_size;
		*y = raw[offset + i][1];
	}
}
Ejemplo n.º 5
0
void GRecommenderLib::loadData(GMatrix& data, const char* szFilename)
{
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		GSparseMatrix sm(doc.root());
		data.resize(0, 3);
		for(size_t i = 0; i < sm.rows(); i++)
		{
			GSparseMatrix::Iter rowEnd = sm.rowEnd(i);
			for(GSparseMatrix::Iter it = sm.rowBegin(i); it != rowEnd; it++)
			{
				GVec& vec = data.newRow();
				vec[0] = (double)i;
				vec[1] = (double)it->first;
				vec[2] = it->second;
			}
		}
	}
	else if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		data.loadArff(szFilename);
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
}
Ejemplo n.º 6
0
void Loader::loadTemperatureData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	// TODO: determine optimal granularity
	
	GMatrix raw;
	raw.loadArff("data/melbourne-temperature.arff");
	
	size_t dims = 2;
	size_t offset = 0;
	size_t granularity = 7;
	
	double *x, *y;
	
	trainFeat.resize(TRAIN_SIZE, 1);
	trainLab.resize(TRAIN_SIZE, dims);
	testFeat.resize(TEST_SIZE, 1);
	testLab.resize(TEST_SIZE, dims);
	
	for(size_t i = 0; i < TRAIN_SIZE + TEST_SIZE; i++)
	{
		if(i < TRAIN_SIZE)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - TRAIN_SIZE];
			y = testLab[i - TRAIN_SIZE];
		}
		
		x[0] = double(i) / TRAIN_SIZE;
		
		for(size_t j = 0; j < dims; j++)
		{
			y[j] = 0.0;
			for(size_t k = granularity; k < granularity * 2; k++)
			{
				y[j] += raw[offset + i * granularity + k][j + 1];
			}
			y[j] /= double(granularity);
			y[j] *= 0.05;
		}
	}
	
	// Smooth input data
	
	for(size_t i = 1; i < TRAIN_SIZE - 1; i++)
	{
		for(size_t j = 0; j < dims; j++)
		{
			trainLab[i][j] = (trainLab[i-1][j] + trainLab[i][j] + trainLab[i+1][j]) / 3.0;
		}
	}
}
Ejemplo n.º 7
0
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Load the dataset by extension
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix data;
	vector<size_t> abortedCols;
	vector<size_t> ambiguousCols;
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}
	
	// Now load the data
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(_stricmp(input_type, "csv") == 0)
	{
		GCSVParser parser;
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(input_type, "dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	}
	
	// Split data into a feature matrix and a label matrix
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
Ejemplo n.º 8
0
GMatrix* loadData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix* pData = new GMatrix();
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		pData->loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
		pData->loadCsv(szFilename, ',', false, false);
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
		pData->loadCsv(szFilename, '\0', false, false);
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	return pData;
}
Ejemplo n.º 9
0
void Loader::loadPrecipitationData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	// Only 87 samples, so TEST_SIZE should be 23 if TRAIN_SIZE is 64
	
	GMatrix raw;
	raw.loadArff("data/precipitation.arff");
	
	size_t dims = 1;
	double *x, *y;
	
	trainFeat.resize(TRAIN_SIZE, 1);
	trainLab.resize(TRAIN_SIZE, dims);
	testFeat.resize(TEST_SIZE, 1);
	testLab.resize(TEST_SIZE, dims);
	
	for(size_t i = 0; i < TRAIN_SIZE + TEST_SIZE; i++)
	{
		if(i < TRAIN_SIZE)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - TRAIN_SIZE];
			y = testLab[i - TRAIN_SIZE];
		}
		
		x[0] = double(i) / TRAIN_SIZE;
		y[0] = raw[i][1] * 0.05;
		// y[1] = raw[i][2] * 0.05;
		// y[2] = raw[i][3] * 0.05;
		// y[3] = raw[i][4] * 0.05;
	}
	
	for(size_t i = 1; i < TRAIN_SIZE - 1; i++)
	{
		for(size_t j = 0; j < dims; j++)
		{
			trainLab[i][j] = (trainLab[i-1][j] + 2 * trainLab[i][j] + trainLab[i+1][j]) * 0.25;
		}
	}
}
Ejemplo n.º 10
0
void Loader::loadSunSpotData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/sunspots.arff");
	
	size_t dims = 1;
	size_t offset = 0;
	size_t granularity = 6;
	size_t train_size = 240;
	size_t test_size = 240;
	
	double scale = 0.01;
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		*x = double(i) / train_size;
		
		*y = 0.0;
		for(size_t j = 0; j < granularity; j++)
		{
			*y += scale * raw[offset + granularity * i + j][3];
		}
		*y /= granularity;
	}
}
Ejemplo n.º 11
0
void Loader::loadStockData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	// 3   - AAPL
	// 108 - MSFT
	// 157 - DJI
	
	GMatrix raw;
	raw.loadArff("data/stocks.arff");
	
	size_t dims = 1;
	size_t offset = 600;
	size_t train_size = 500;
	size_t test_size = 300;
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	double log_10 = log(10);
	double vert_offset = 3.9;
	double scale = 20;
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		x[0] = double(i) / train_size;
		y[0] = ((log(raw[offset + i][157]) / log_10) - vert_offset) * scale;
	}
}
Ejemplo n.º 12
0
void Loader::loadAirPassengerData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/air_passengers.arff");
	
	size_t dims = 1;
	size_t offset = 0;
	size_t train_size = 72;
	size_t test_size = 72;
	
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	double log_10 = log(10);
	double vert_offset = 2;
	double scale = 10;//0.1;
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		*x = double(i) / train_size;
		*y = ((log(scale * raw[offset + i][0]) / log_10) - vert_offset);
	}
	
	trainLab.saveArff("out/train.arff");
	testLab.saveArff("out/test.arff");
}
Ejemplo n.º 13
0
GSparseMatrix* GRecommenderLib::loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix data;
		data.loadArff(szFilename);
		if(data.cols() != 3)
			throw Ex("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		double m0 = data.columnMin(0);
		double r0 = data.columnMax(0) - m0;
		double m1 = data.columnMin(1);
		double r1 = data.columnMax(1) - m1;
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			throw Ex("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			throw Ex("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		std::unique_ptr<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < data.rows(); i++)
		{
			GVec& row = data.row(i);
			pMatrix->set(size_t(row[0]), size_t(row[1]), row[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
Ejemplo n.º 14
0
void Loader::loadOzoneData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/mhsets_monthly-ozone.arff");
	
	size_t dims = 1;
	size_t offset = 0;
	size_t train_size = 108;
	size_t test_size = 44;
	
	double *x, *y;
	
	trainFeat.resize(train_size, 1);
	trainLab.resize(train_size, dims);
	testFeat.resize(test_size, 1);
	testLab.resize(test_size, dims);
	
	for(size_t i = 0; i < train_size + test_size; i++)
	{
		if(i < train_size)
		{
			x = trainFeat[i];
			y = trainLab[i];
		}
		else
		{
			x = testFeat[i - train_size];
			y = testLab[i - train_size];
		}
		
		*x = double(i) / train_size;
		*y = log(raw[offset + i][0]) / log(10);
	}
	
	trainLab.saveArff("out/train.arff");
	testLab.saveArff("out/test.arff");
}
Ejemplo n.º 15
0
void doit()
{
	// Load the data
	GMatrix trainLab;
	GMatrix testLab;
	if (chdir("../bin") != 0)
	{
	}
	trainLab.loadArff("train.arff");
	testLab.loadArff("test.arff");
	double dataMin = trainLab.columnMin(0);
	double dataMax = trainLab.columnMax(0);
	trainLab.normalizeColumn(0, dataMin, dataMax, -5.0, 5.0);
	testLab.normalizeColumn(0, dataMin, dataMax, -5.0, 5.0);
	GMatrix trainFeat(trainLab.rows(), 1);
	for(size_t i = 0; i < trainLab.rows(); i++)
		trainFeat[i][0] = (double)i / trainLab.rows() - 0.5;
	GMatrix testFeat(testLab.rows(), 1);
	for(size_t i = 0; i < testLab.rows(); i++)
		testFeat[i][0] = (double)(i + trainLab.rows()) / trainLab.rows() - 0.5;

	// Make a neural network
	GNeuralNet nn;
	GUniformRelation relOne(1);
	nn.beginIncrementalLearning(relOne, relOne);

	// Initialize the weights of the sine units to match the frequencies used by the Fourier transform.
	GLayerClassic* pSine2 = new GLayerClassic(1, 64, new GActivationSin());
	GMatrix& wSin = pSine2->weights();
	GVec& bSin = pSine2->bias();
	for(size_t i = 0; i < pSine2->outputs() / 2; i++)
	{
		wSin[0][2 * i] = 2.0 * M_PI * (i + 1);
		bSin[2 * i] = 0.5 * M_PI;
		wSin[0][2 * i + 1] = 2.0 * M_PI * (i + 1);
		bSin[2 * i + 1] = M_PI;
	}

	// Make the hidden layer
	GLayerMixed* pMix2 = new GLayerMixed();
	pSine2->resize(1, pSine2->outputs(), &nn.rand(), PERTURBATION);
	pMix2->addComponent(pSine2);
	GLayerClassic* pSoftPlus2 = new GLayerClassic(1, SOFTPLUS_NODES, new GActivationSoftPlus());
	pMix2->addComponent(pSoftPlus2);
	GLayerClassic* pIdentity2 = new GLayerClassic(1, IDENTITY_NODES, new GActivationIdentity());
	pMix2->addComponent(pIdentity2);
	nn.addLayer(pMix2);

	// Make the output layer
	GLayerClassic* pIdentity3 = new GLayerClassic(FLEXIBLE_SIZE, trainLab.cols(), new GActivationIdentity());
	pIdentity3->resize(pMix2->outputs(), pIdentity3->outputs(), &nn.rand(), PERTURBATION);
	nn.addLayer(pIdentity3);

	// Initialize all the non-periodic nodes to approximate the identity function, then perturb a little bit
	pSoftPlus2->setWeightsToIdentity();
	for(size_t i = 0; i < SOFTPLUS_NODES; i++)
	{
		pSoftPlus2->bias()[i] += SOFTPLUS_SHIFT;
		pIdentity3->renormalizeInput(pSine2->outputs() + i, 0.0, 1.0, SOFTPLUS_SHIFT, SOFTPLUS_SHIFT + 1.0);
	}
	pIdentity2->setWeightsToIdentity();
	pSoftPlus2->perturbWeights(nn.rand(), PERTURBATION);
	pIdentity2->perturbWeights(nn.rand(), PERTURBATION);

	// Randomly initialize the weights on the output layer
	pIdentity3->weights().setAll(0.0);
	pIdentity3->perturbWeights(nn.rand(), PERTURBATION);

	// Open Firefox to view the progress
	GApp::systemCall("firefox ./view.html#progress.svg", false, true);

	// Do some training
	GRandomIndexIterator ii(trainLab.rows(), nn.rand());
	nn.setLearningRate(LEARNING_RATE);
	for(size_t epoch = 0; epoch < TRAINING_EPOCHS; epoch++)
	{
		// Visit each sample in random order
		ii.reset();
		size_t i;
		while(ii.next(i))
		{
			// Regularize
			pIdentity3->scaleWeights(1.0 - nn.learningRate() * REGULARIZATION_TERM, true);
			pIdentity3->diminishWeights(nn.learningRate() * REGULARIZATION_TERM, true);

			// Train
			nn.trainIncremental(trainFeat[i], trainLab[i]); // One iteration of stochastic gradient descent
		}

		// Report progress
		double rmse = sqrt(nn.sumSquaredError(trainFeat, trainLab) / trainLab.rows());
		if(epoch % (TRAINING_EPOCHS / 100) == 0)
		{
			double val = sqrt(nn.sumSquaredError(testFeat, testLab) / testLab.rows());
			cout << "prog=" << to_str((double)epoch * 100.0 / TRAINING_EPOCHS) << "%	rmse=" << to_str(rmse) << "	val=" << to_str(val) << "\n";
			plot_it("progress.svg", nn, trainFeat, trainLab, testFeat, testLab);
		}
	}
}
Ejemplo n.º 16
0
void unsupervisedBackProp(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int targetDims = args.pop_uint();

	// Parse Options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GUnsupervisedBackProp* pUBP = new GUnsupervisedBackProp(targetDims, &prng);
	Holder<GUnsupervisedBackProp> hUBP(pUBP);
	vector<size_t> paramRanges;
	string sModelOut;
	string sProgress;
	bool inputBias = true;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-addlayer"))
			pUBP->neuralNet()->addLayer(args.pop_uint());
		else if(args.if_pop("-params"))
		{
			if(pUBP->jitterer())
				throw Ex("You can't change the params after you add an image jitterer");
			size_t paramDims = args.pop_uint();
			for(size_t i = 0; i < paramDims; i++)
				paramRanges.push_back(args.pop_uint());
		}
		else if(args.if_pop("-modelin"))
		{
			GDom doc;
			doc.loadJson(args.pop_string());
			GLearnerLoader ll(prng);
			pUBP = new GUnsupervisedBackProp(doc.root(), ll);
			hUBP.reset(pUBP);
		}
		else if(args.if_pop("-modelout"))
			sModelOut = args.pop_string();
		else if(args.if_pop("-intrinsicin"))
		{
			GMatrix* pInt = new GMatrix();
			pInt->loadArff(args.pop_string());
			pUBP->setIntrinsic(pInt);
		}
		else if(args.if_pop("-jitter"))
		{
			if(paramRanges.size() != 2)
				throw Ex("The params must be set to 2 before a tweaker is set");
			size_t channels = args.pop_uint();
			double rot = args.pop_double();
			double trans = args.pop_double();
			double zoom = args.pop_double();
			GImageJitterer* pJitterer = new GImageJitterer(paramRanges[0], paramRanges[1], channels, rot, trans, zoom);
			pUBP->setJitterer(pJitterer);
		}
		else if(args.if_pop("-noinputbias"))
			inputBias = false;
		else if(args.if_pop("-progress"))
		{
			sProgress = args.pop_string();
			pUBP->trackProgress();
		}
		else if(args.if_pop("-onepass"))
			pUBP->onePass();
		else
			throw Ex("Invalid option: ", args.peek());
	}
	pUBP->setParams(paramRanges);
	pUBP->setUseInputBias(inputBias);

	// Transform the data
	GMatrix* pDataAfter = pUBP->doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);

	// Save the model (if requested)
	if(sModelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pUBP->serialize(&doc));
		doc.saveJson(sModelOut.c_str());
	}
	if(sProgress.length() > 0)
		pUBP->progress().saveArff(sProgress.c_str());
}
Ejemplo n.º 17
0
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse params
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
		dataOrig.deleteColumns(ignore[i], 1);

	GRelation* pOrigRel = dataOrig.relation().clone();
	std::unique_ptr<GRelation> hOrigRel(pOrigRel);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		GIncrementalTransformChainer* pChainer = new GIncrementalTransformChainer(new GNormalize(), pNtc);
		hChainer.reset(pChainer);
		pFilter = pChainer;
	}
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	GMatrix* pData = pFilter->transformBatch(dataOrig);
	std::unique_ptr<GMatrix> hData(pData);

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	std::unique_ptr<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = pMatrix->newRow();
				vec[0] = (double)i;
				vec[1] = (double)j;
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = pFilter->untransformBatch(*pData);
	pOut->setRelation(hOrigRel.release());
	pOut->print(cout);
}
Ejemplo n.º 18
0
///Return a pointer to newly allocated data read from the command line
///represented by args.
///
///The returned matrix is allocated by new and it is the caller's
///responsibility to deallocate it. The suggested manner is to use a
///Holder<GMatrix*>
///
///In the returned matrix, all of the attributes designated as labels
///have been moved to the end and ignored attributes have been
///removed. The original indices of all the attributes are returned in
///originalIndices.
///
///\param args the command-line arguments
///
///\param pLabelDims (out parameter) the index of the first attribute
///which is designated a label.
///
///\param originalIndices the vector in which to place the original
///indices.  originalIndices[i] is the index in the original data file
///of the attribute currently at index i.
void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims,
			      std::vector<size_t>& originalIndices)
{
	// Load the dataset by extension
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		data.loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
		data.loadCsv(szFilename, ',', false, false);
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
		data.loadCsv(szFilename, '\0', false, false);
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);

	//Make the initial list of original indices
	originalIndices.resize(data.cols());
	for(std::size_t i = 0; i < originalIndices.size(); ++i){
	  originalIndices.at(i) = i;
	}

	// Parse params
	vector<size_t> ignore;
	vector<size_t> labels;
	while(args.next_is_flag())
	{
		if(args.if_pop("-labels"))
			parseAttributeList(labels, args, data.cols());
		else if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, data.cols());
		else
			break;
	}

	// Throw out the ignored attributes
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
	{
		data.deleteColumn(ignore[i]);
		originalIndices.erase(originalIndices.begin()+ignore[i]);
		for(size_t j = 0; j < labels.size(); j++)
		{
			if(labels[j] >= ignore[i])
			{
				if(labels[j] == ignore[i])
					throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label");
				labels[j]--;
			}
		}
	}

	// Swap label columns to the end
	pLabelDims = std::max((size_t)1, labels.size());
	for(size_t i = 0; i < labels.size(); i++)
	{
		size_t src = labels[i];
		size_t dst = data.cols() - pLabelDims + i;
		if(src != dst)
		{
			data.swapColumns(src, dst);
			std::swap(originalIndices.at(src),
				  originalIndices.at(dst));
			for(size_t j = i + 1; j < labels.size(); j++)
			{
				if(labels[j] == dst)
				{
					labels[j] = src;
					break;
				}
			}
		}
	}
}