Example #1
0
void crossValidate(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t folds = 2;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-folds"))
			folds = args.pop_uint();
		else
			ThrowError("Invalid crossvalidate option: ", args.peek());
	}
	if(folds < 2)
		ThrowError("There must be at least 2 folds.");

	// Load the data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Do cross-validation
	double mae;
	double mse = pModel->crossValidate(*pData, folds, &mae);
	cout << "RMSE=" << sqrt(mse) << ", MSE=" << mse << ", MAE=" << mae << "\n";
}
Example #2
0
void precisionRecall(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Generate precision-recall data
	GMatrix* pResults = pModel->precisionRecall(*pData, ideal);
	Holder<GMatrix> hResults(pResults);
	pResults->deleteColumn(2); // we don't need the false-positive rate column
	pResults->print(cout);
}
Example #3
0
void GRecommenderLib::precisionRecall(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix data;
	loadData(data, args.pop_string());

	// Instantiate the recommender
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Generate precision-recall data
	GMatrix* pResults = pModel->precisionRecall(data, ideal);
	std::unique_ptr<GMatrix> hResults(pResults);
	pResults->deleteColumns(2, 1); // we don't need the false-positive rate column
	pResults->print(cout);
}
Example #4
0
void GRecommenderLib::transacc(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			throw Ex("Invalid crossvalidate option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		throw Ex("No training set specified.");
	GMatrix train;
	loadData(train, args.pop_string());
	if(args.size() < 1)
		throw Ex("No test set specified.");
	GMatrix test;
	loadData(test, args.pop_string());

	// Instantiate the recommender
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Do cross-validation
	double mae;
	double mse = pModel->trainAndTest(train, test, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
Example #5
0
void transacc(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid crossvalidate option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No training set specified.");
	GMatrix* pTrain = loadData(args.pop_string());
	Holder<GMatrix> hTrain(pTrain);
	if(args.size() < 1)
		ThrowError("No test set specified.");
	GMatrix* pTest = loadData(args.pop_string());
	Holder<GMatrix> hTest(pTest);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Do cross-validation
	double mae;
	double mse = pModel->trainAndTest(*pTrain, *pTest, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
Example #6
0
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid neighbor finder option: ", args.peek());
	}

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
Example #7
0
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Load the dataset by extension
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix data;
	vector<size_t> abortedCols;
	vector<size_t> ambiguousCols;
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}
	
	// Now load the data
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(_stricmp(input_type, "csv") == 0)
	{
		GCSVParser parser;
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(input_type, "dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	}
	
	// Split data into a feature matrix and a label matrix
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
Example #8
0
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
Example #9
0
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid neighbor finder option: ", args.peek());
	}

	// Do the attribute selection
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	if(outFilename.length() > 0)
	{
		as.train(data);
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
		pDataOut->saveArff(outFilename.c_str());
	}
	else
		as.train(data);
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
	  cout << originalIndices.at(as.ranks()[i]) << " " << pRel->attrName(as.ranks()[i]) << "\n";
}
Example #10
0
void Extrapolate(GArgReader &args)
{
	// Load the model
	if(args.size() < 1)
	{
		throw Ex("Model not specified.");
	}
	GDom doc;
	doc.loadJson(args.pop_string());
	GLearnerLoader ll(true);
	GSupervisedLearner *pLearner = ll.loadLearner(doc.root());
	std::unique_ptr<GSupervisedLearner> hLearner(pLearner);
	
	// Parse options
	
	double start = 1.0;
	double length = 1.0;
	double step = 0.0002;
	bool useFeatures = false;
	bool outputFeatures = true;
	
	GNeuralDecomposition *nd = (GNeuralDecomposition *) pLearner;
	std::unique_ptr<GMatrix> hFeatures;
	
	while(args.next_is_flag())
	{
		if(args.if_pop("-start"))
		{
			start = args.pop_double();
		}
		else if(args.if_pop("-length"))
		{
			length = args.pop_double();
		}
		else if(args.if_pop("-step"))
		{
			step = args.pop_double();
		}
		else if(args.if_pop("-features"))
		{
			LoadData(args, hFeatures);
			useFeatures = true;
		}
		else if(args.if_pop("-outputFeatures"))
		{
			outputFeatures = true;
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}
	
	// Extrapolate
	GMatrix *pOutput;
	if(useFeatures)
		pOutput = nd->extrapolate(*hFeatures.get());
	else
		pOutput = nd->extrapolate(start, length, step, outputFeatures);
	std::unique_ptr<GMatrix> hOutput(pOutput);
	
	// Output predictions
	pOutput->print(cout);
}
Example #11
0
void Train(GArgReader &args)
{
	// Load series from file
	std::unique_ptr<GMatrix> hSeries, hFeatures;
	LoadData(args, hSeries);
	GMatrix *pSeries = hSeries.get();
	
	// Split features/labels
	if(pSeries->cols() == 2)
	{
		GMatrix *pFeatures = pSeries->cloneSub(0, 0, pSeries->rows(), 1);
		GMatrix *pLabels = pSeries->cloneSub(0, 1, pSeries->rows(), 1);
		hFeatures.reset(pFeatures);
		hSeries.reset(pLabels);
		pSeries = pLabels;
	}
	else if(pSeries->cols() > 2)
	{
		throw Ex("Too many columns!");
	}
	
	// Parse options
	GNeuralDecomposition *nd = new GNeuralDecomposition();
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularization"))
			nd->setRegularization(args.pop_double());
		else if(args.if_pop("-learningRate"))
			nd->setLearningRate(args.pop_double());
		else if(args.if_pop("-linearUnits"))
			nd->setLinearUnits(args.pop_uint());
		else if(args.if_pop("-softplusUnits"))
			nd->setSoftplusUnits(args.pop_uint());
		else if(args.if_pop("-sigmoidUnits"))
			nd->setSigmoidUnits(args.pop_uint());
		else if(args.if_pop("-epochs"))
			nd->setEpochs(args.pop_uint());
		else if(args.if_pop("-features"))
			LoadData(args, hFeatures);
		else if(args.if_pop("-filterLogarithm"))
			nd->setFilterLogarithm(true);
		else
			throw Ex("Invalid option: ", args.peek());
	}
	
	if(hFeatures.get() == NULL)
	{
		// Generate features
		GMatrix *pFeatures = new GMatrix(pSeries->rows(), 1);
		for(size_t i = 0; i < pSeries->rows(); i++)
		{
			pFeatures->row(i)[0] = i / (double) pSeries->rows();
		}
		hFeatures.reset(pFeatures);
	}
	
	// Train
	GMatrix *pFeatures = hFeatures.get();
	nd->train(*pFeatures, *pSeries);
	
	// Output the trained model
	GDom doc;
	doc.setRoot(nd->serialize(&doc));
	doc.writeJson(cout);
}
Example #12
0
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse params
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
		dataOrig.deleteColumns(ignore[i], 1);

	GRelation* pOrigRel = dataOrig.relation().clone();
	std::unique_ptr<GRelation> hOrigRel(pOrigRel);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		GIncrementalTransformChainer* pChainer = new GIncrementalTransformChainer(new GNormalize(), pNtc);
		hChainer.reset(pChainer);
		pFilter = pChainer;
	}
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	GMatrix* pData = pFilter->transformBatch(dataOrig);
	std::unique_ptr<GMatrix> hData(pData);

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	std::unique_ptr<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = pMatrix->newRow();
				vec[0] = (double)i;
				vec[1] = (double)j;
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = pFilter->untransformBatch(*pData);
	pOut->setRelation(hOrigRel.release());
	pOut->print(cout);
}
Example #13
0
void selfOrganizingMap(GArgReader& args){
  // Load the file
  GMatrix* pData = loadData(args.pop_string());
  Holder<GMatrix> hData(pData);

  // Parse arguments
  std::vector<double> netDims;
  unsigned numNodes = 1;
  while(args.next_is_uint()){
    unsigned dim = args.pop_uint();
    netDims.push_back(dim);
    numNodes *= dim;
  }
  if(netDims.size() < 1){
    throw Ex("No dimensions specified for self organizing map.  ",
	       "A map must be at least 1 dimensional.");
  }

  Holder<SOM::ReporterChain> reporters(new SOM::ReporterChain);
  Holder<SOM::TrainingAlgorithm> alg(NULL);
  Holder<GDistanceMetric> weightDist(new GRowDistance);
  Holder<GDistanceMetric> nodeDist(new GRowDistance);
  Holder<SOM::NodeLocationInitialization> topology(new SOM::GridTopology);
  Holder<SOM::NodeWeightInitialization> weightInit
    (new SOM::NodeWeightInitializationTrainingSetSample(NULL));
  Holder<SOM::NeighborhoodWindowFunction> 
    windowFunc(new SOM::GaussianWindowFunction());

  //Loading and saving
  string loadFrom = "";
  string saveTo = "";

  //Parameters for different training algorithms
  string algoName = "batch";
  double startWidth = -1;//Start width - set later if still negative
  double endWidth   = -1;//End width   - set later if still negative
  double startRate = -1;//Start learning rate
  double endRate   = -1;//End learning rate
  unsigned numIter     = 100;//Total iterations
  unsigned numConverge = 1;//#steps for batch to converge

  while(args.next_is_flag()){
    if(args.if_pop("-tofile")){
      saveTo = args.pop_string();
    }else if(args.if_pop("-fromfile")){
      loadFrom = args.pop_string();
    }else if(args.if_pop("-seed")){
      GRand::global().setSeed(args.pop_uint());
    }else if(args.if_pop("-neighborhood")){
      string name = args.pop_string();
      if(name == "gaussian"){
	windowFunc.reset(new SOM::GaussianWindowFunction());
      }else if(name == "uniform"){
	windowFunc.reset(new SOM::UniformWindowFunction());
      }else{
	throw Ex("Only gaussian and uniform are acceptible ",
		   "neighborhood types");
      }
    }else if(args.if_pop("-printMeshEvery")){
      using namespace SOM;
      unsigned interval = args.pop_uint();
      string baseFilename = args.pop_string();
      unsigned xDim = args.pop_uint();
      unsigned yDim = args.pop_uint();
      bool showTrain = false;
      if(args.if_pop("showTrain") || args.if_pop("showtrain")){
	showTrain = true;
      }
      smart_ptr<Reporter> weightReporter
	(new SVG2DWeightReporter(baseFilename, xDim, yDim, showTrain));
      Holder<IterationIntervalReporter> intervalReporter
	(new IterationIntervalReporter(weightReporter, interval));
      reporters->add(intervalReporter.release());
    }else if(args.if_pop("-batchTrain")){
      algoName = "batch";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      numIter = args.pop_uint();
      numConverge = args.pop_uint();
    }else if(args.if_pop("-stdTrain")){
      algoName = "standard";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      startRate = args.pop_double();
      endRate = args.pop_double();
      numIter = args.pop_uint();
    }else{
      throw Ex("Invalid option: ", args.peek());
    }
  }

  //Create the training algorithm
  Holder<SOM::TrainingAlgorithm> algo;
  if(algoName == "batch"){
    double netRadius = *std::max_element(netDims.begin(), netDims.end());
    if(startWidth < 0){ startWidth = 2*netRadius; }
    if(endWidth < 0){ endWidth = 1; }
    algo.reset( new SOM::BatchTraining
      (startWidth, endWidth, numIter, numConverge,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else if(algoName == "standard"){
    algo.reset( new SOM::TraditionalTraining
      (startWidth, endWidth, startRate, endRate, numIter,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else{
    throw Ex("Unknown type of training algorithm: \"",
	       algoName, "\"");
  }

  //Create the network & transform the data
  Holder<GSelfOrganizingMap> som;
  Holder<GMatrix> out;
  
  if(loadFrom == ""){
    //Create map from arguments given
    som.reset(new GSelfOrganizingMap
      (netDims, numNodes, topology.release(), algo.release(), 
       weightDist.release(), nodeDist.release()));
    //Train the network and transform the data in place
    out.reset(som->doit(*pData));
  }else{
    //Create map from file
    GDom source;
    source.loadJson(loadFrom.c_str());
    som.reset(new GSelfOrganizingMap(source.root()));
    //Transform using the loaded network
    out.reset(som->transformBatch(*pData));
  }

  //Save the trained network
  if(saveTo != ""){
    GDom serialized;
    GDomNode* root = som->serialize(&serialized);
    serialized.setRoot(root);
    serialized.saveJson(saveTo.c_str());
  }

  //Print the result
  out->print(cout);
}
Example #14
0
void principalComponentAnalysis(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	string eigenvalues;
	string components;
	string modelIn;
	string modelOut;
	bool aboutOrigin = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-roundtrip"))
			roundTrip = args.pop_string();
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else if(args.if_pop("-components"))
			components = args.pop_string();
		else if(args.if_pop("-aboutorigin"))
			aboutOrigin = true;
		else if(args.if_pop("-modelin"))
			modelIn = args.pop_string();
		else if(args.if_pop("-modelout"))
			modelOut = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GRand prng(seed);
	GPCA* pTransform = NULL;
	if(modelIn.length() > 0)
	{
		GDom doc;
		doc.loadJson(modelIn.c_str());
		GLearnerLoader ll(prng);
		pTransform = new GPCA(doc.root(), ll);
	}
	else
	{
		pTransform = new GPCA(nTargetDims, &prng);
		if(aboutOrigin)
			pTransform->aboutOrigin();
		if(eigenvalues.length() > 0)
			pTransform->computeEigVals();
		pTransform->train(*pData);
	}
	Holder<GPCA> hTransform(pTransform);

	GMatrix* pDataAfter = pTransform->transformBatch(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = pTransform->eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// Save the components
	if(components.length() > 0)
		pTransform->components()->saveArff(components.c_str());

	// Do the round-trip
	if(roundTrip.size() > 0)
	{
		GMatrix roundTripped(pData->rows(), pData->cols());
		for(size_t i = 0; i < pData->rows(); i++)
			pTransform->untransform(pDataAfter->row(i), roundTripped.row(i));
		roundTripped.saveArff(roundTrip.c_str());
	}

	if(modelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pTransform->serialize(&doc));
		doc.saveJson(modelOut.c_str());
	}

	pDataAfter->print(cout);
}
Example #15
0
void neuroPCA(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool trainBias = true;
	bool linear = false;
	string eigenvalues = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-clampbias"))
			trainBias = false;
		else if(args.if_pop("-linear"))
			linear = true;
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GRand prng(seed);
	GNeuroPCA transform(nTargetDims, &prng);
	if(!trainBias)
		transform.clampBias();
	if(linear)
		transform.setActivation(new GActivationIdentity());
	if(eigenvalues.length() > 0)
		transform.computeEigVals();
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = transform.eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// In linear mode, people usually expect normalized eigenvectors, so let's normalize them now
	if(linear)
	{
		GMatrix* pWeights = transform.weights();
		GAssert(pWeights->cols() == pData->cols());
		for(int i = 0; i < nTargetDims; i++)
		{
			double scal = sqrt(GVec::squaredMagnitude(pWeights->row(i + 1), pWeights->cols()));
			for(size_t j = 0; j < pDataAfter->rows(); j++)
				pDataAfter->row(j)[i] *= scal;
		}
	}

	pDataAfter->print(cout);
}
Example #16
0
void fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	sp_relation pOrigRel = pDataOrig->relation();
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	hDataOrig.release();
	pDataOrig = NULL;

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = i;
				pVec[1] = j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = filter.untransformBatch(*pData);
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
Example #17
0
///Return a pointer to newly allocated data read from the command line
///represented by args.
///
///The returned matrix is allocated by new and it is the caller's
///responsibility to deallocate it. The suggested manner is to use a
///Holder<GMatrix*>
///
///In the returned matrix, all of the attributes designated as labels
///have been moved to the end and ignored attributes have been
///removed. The original indices of all the attributes are returned in
///originalIndices.
///
///\param args the command-line arguments
///
///\param pLabelDims (out parameter) the index of the first attribute
///which is designated a label.
///
///\param originalIndices the vector in which to place the original
///indices.  originalIndices[i] is the index in the original data file
///of the attribute currently at index i.
void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims,
			      std::vector<size_t>& originalIndices)
{
	// Load the dataset by extension
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		data.loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
		data.loadCsv(szFilename, ',', false, false);
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
		data.loadCsv(szFilename, '\0', false, false);
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);

	//Make the initial list of original indices
	originalIndices.resize(data.cols());
	for(std::size_t i = 0; i < originalIndices.size(); ++i){
	  originalIndices.at(i) = i;
	}

	// Parse params
	vector<size_t> ignore;
	vector<size_t> labels;
	while(args.next_is_flag())
	{
		if(args.if_pop("-labels"))
			parseAttributeList(labels, args, data.cols());
		else if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, data.cols());
		else
			break;
	}

	// Throw out the ignored attributes
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
	{
		data.deleteColumn(ignore[i]);
		originalIndices.erase(originalIndices.begin()+ignore[i]);
		for(size_t j = 0; j < labels.size(); j++)
		{
			if(labels[j] >= ignore[i])
			{
				if(labels[j] == ignore[i])
					throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label");
				labels[j]--;
			}
		}
	}

	// Swap label columns to the end
	pLabelDims = std::max((size_t)1, labels.size());
	for(size_t i = 0; i < labels.size(); i++)
	{
		size_t src = labels[i];
		size_t dst = data.cols() - pLabelDims + i;
		if(src != dst)
		{
			data.swapColumns(src, dst);
			std::swap(originalIndices.at(src),
				  originalIndices.at(dst));
			for(size_t j = i + 1; j < labels.size(); j++)
			{
				if(labels[j] == dst)
				{
					labels[j] = src;
					break;
				}
			}
		}
	}
}