Example No. 1
void aggregateCols(GArgReader& args)
{
	size_t c = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);

	// Keep only the .arff files, so the results matrix gets exactly one column per dataset
	vector<string> arffFiles;
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") == 0)
			arffFiles.push_back(*it);
	}
	if(arffFiles.empty())
		throw Ex("No .arff files found in the current directory");

	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	size_t i = 0;
	for(vector<string>::iterator it = arffFiles.begin(); it != arffFiles.end(); it++)
	{
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			// One row per row of the first dataset, one column per .arff file
			pResults = new GMatrix(pData->rows(), arffFiles.size());
			hResults.reset(pResults);
		}
		// Copy column c of this dataset into column i of the results
		pResults->copyColumns(i, pData, c, 1);
		i++;
	}
	pResults->print(cout);
}
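A minimal driver sketch for this example; it assumes GArgReader's (argc, argv)-style constructor includes the program name (which we pop first), and the column index 3 is made up:

int main()
{
	// Hypothetical arguments: aggregate column 3 of every .arff file in the current directory
	char* argList[] = { (char*)"aggregate_cols", (char*)"3" };
	GArgReader args(2, argList);
	args.pop_string(); // skip the program name
	aggregateCols(args);
	return 0;
}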
Example No. 2
// virtual
void GWag::trainInner(GMatrix& features, GMatrix& labels)
{
	GNeuralNet* pTemp = NULL;
	Holder<GNeuralNet> hTemp;
	size_t weights = 0;
	double* pWeightBuf = NULL;
	double* pWeightBuf2 = NULL;
	ArrayHolder<double> hWeightBuf;
	for(size_t i = 0; i < m_models; i++)
	{
		// Train another network from scratch on the same data
		m_pNN->train(features, labels);
		if(pTemp)
		{
			// Average m_pNN into pTemp as an incremental running mean:
			// pTemp = pTemp * i/(i+1) + m_pNN * 1/(i+1)
			if(!m_noAlign)
				m_pNN->align(*pTemp); // permute m_pNN's hidden units to best match pTemp before averaging
			pTemp->weights(pWeightBuf);  // pWeightBuf  <- current average
			m_pNN->weights(pWeightBuf2); // pWeightBuf2 <- newly trained weights
			GVec::multiply(pWeightBuf, double(i) / (i + 1), weights);
			GVec::addScaled(pWeightBuf, 1.0 / (i + 1), pWeightBuf2, weights);
			pTemp->setWeights(pWeightBuf);
		}
		else
		{
			// Make a deep copy of m_pNN by round-tripping it through serialization
			GDom doc;
			GDomNode* pNode = m_pNN->serialize(&doc);
			GLearnerLoader ll(m_rand);
			pTemp = new GNeuralNet(pNode, ll);
			hTemp.reset(pTemp);
			weights = pTemp->countWeights();
			pWeightBuf = new double[2 * weights];
			hWeightBuf.reset(pWeightBuf);
			pWeightBuf2 = pWeightBuf + weights;
		}
	}
	// Load the final averaged weights into the wrapped network
	pTemp->weights(pWeightBuf);
	m_pNN->setWeights(pWeightBuf);
}
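The averaging step above is an incremental running mean: after folding in the (i+1)-th model, avg = avg * i/(i+1) + w / (i+1), so avg is always the mean of all networks trained so far. A self-contained sketch of that update over plain arrays (the function name is illustrative):

#include <vector>

// avg holds the mean of the first i weight vectors; fold in w so that
// avg becomes the mean of all i+1 vectors.
void updateRunningMean(std::vector<double>& avg, const std::vector<double>& w, size_t i)
{
	for(size_t j = 0; j < avg.size(); j++)
		avg[j] = avg[j] * (double(i) / (i + 1)) + w[j] / (i + 1);
}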
Example No. 3
void aggregateRows(GArgReader& args)
{
	size_t r = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue;
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			pResults = new GMatrix(pData->relation());
			hResults.reset(pResults);
		}
		// Move row r out of pData and into the results (transfers ownership of the row)
		pResults->takeRow(pData->releaseRow(r));
	}
	if(!pResults)
		throw Ex("No .arff files found in the current directory");
	pResults->print(cout);
}
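The same hypothetical driver pattern as in Example No. 1 works here; with the made-up arguments below it prints row 0 of every .arff file in the current directory:

int main()
{
	char* argList[] = { (char*)"aggregate_rows", (char*)"0" };
	GArgReader args(2, argList);
	args.pop_string(); // skip the program name
	aggregateRows(args);
	return 0;
}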
Example No. 4
// virtual
void GAgglomerativeClusterer::cluster(const GMatrix* pData)
{
	// Init the metric
	if(!m_pMetric)
		setMetric(new GRowDistance(), true);
	m_pMetric->init(&pData->relation(), false);

	// Find enough neighbors to form a connected graph
	GNeighborGraph* pNF = NULL;
	Holder<GNeighborGraph> hNF;
	size_t neighbors = 6;
	while(true)
	{
		GKdTree* pKdTree = new GKdTree(pData, neighbors, m_pMetric, false);
		pNF = new GNeighborGraph(pKdTree, true);
		hNF.reset(pNF);
		pNF->fillCache();
		if(pNF->isConnected())
			break;
		if(neighbors + 1 >= pData->rows())
			throw Ex("internal problem--a graph with so many neighbors must be connected");
		neighbors = std::min((neighbors * 3) / 2, pData->rows() - 1);
	}

	// Sort all the neighbors by their distances
	size_t count = pData->rows() * neighbors;
	vector< std::pair<double,size_t> > distNeighs;
	distNeighs.resize(count);
	double* pDistances = pNF->squaredDistanceTable();
	size_t* pRows = pNF->cache();
	vector< std::pair<double,size_t> >::iterator it = distNeighs.begin();
	for(size_t i = 0; i < count; i++)
	{
		if(*pRows < pData->rows()) // skip sentinel entries that mark missing neighbors
		{
			it->first = *pDistances;
			it->second = i; // encodes both the row (i / neighbors) and the neighbor slot
			it++;
		}
		pRows++;
		pDistances++;
	}
	std::sort(distNeighs.begin(), it); // sort only the valid prefix by distance

	// Assign each row to its own cluster
	delete[] m_pClusters;
	m_pClusters = new size_t[pData->rows()]; // specifies which cluster each row belongs to
	GIndexVec::makeIndexVec(m_pClusters, pData->rows());
	size_t* pSiblings = new size_t[pData->rows()]; // a cyclical linked list of each row in the cluster
	ArrayHolder<size_t> hSiblings(pSiblings);
	GIndexVec::makeIndexVec(pSiblings, pData->rows()); // init such that each row is in a cluster of 1
	size_t currentClusterCount = pData->rows();
	if(currentClusterCount <= m_clusterCount)
		return; // nothing to do

	// Merge until we have the desired number of clusters
	pRows = pNF->cache();
	for(vector< std::pair<double,size_t> >::iterator dn = distNeighs.begin(); dn != it; dn++)
	{
		// Get the next two closest points
		size_t a = dn->second / neighbors;
		size_t b = pRows[dn->second];
		GAssert(a != b && a < pData->rows() && b < pData->rows());
		size_t clustA = m_pClusters[a];
		size_t clustB = m_pClusters[b];

		// Merge the clusters
		if(clustA == clustB)
			continue; // The two points are already in the same cluster
		if(clustB < clustA) // Make sure clustA has the smaller value
		{
			std::swap(a, b);
			std::swap(clustA, clustB);
		}
		for(size_t i = pSiblings[b]; true; i = pSiblings[i]) // Convert every row in clustB to clustA
		{
			m_pClusters[i] = clustA;
			if(i == b)
				break;
		}
		std::swap(pSiblings[a], pSiblings[b]); // This line joins the cyclical linked lists into one big cycle
		if(clustB < m_clusterCount) // Ensure that the first m_clusterCount cluster numbers are always in use
		{
			for(size_t i = 0; i < pData->rows(); i++) // rename another cluster to take the spot of clustB
			{
				if(m_pClusters[i] >= m_clusterCount)
				{
					for(size_t j = pSiblings[i]; true; j = pSiblings[j])
					{
						m_pClusters[j] = clustB;
						if(j == i)
							break;
					}
					break;
				}
			}
		}
		if(--currentClusterCount <= m_clusterCount)
			return;
	}
	throw Ex("internal error--should have found the desired number of clusters before now");
}
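A minimal usage sketch, assuming GAgglomerativeClusterer takes the target cluster count in its constructor and exposes the result through whichCluster (both assumptions; check the Waffles headers), with a hypothetical input file:

GMatrix* pData = loadData("data.arff"); // hypothetical input file
Holder<GMatrix> hData(pData);
GAgglomerativeClusterer clusterer(3); // merge down to 3 clusters
clusterer.cluster(pData);
for(size_t i = 0; i < pData->rows(); i++)
	cout << "row " << i << " -> cluster " << clusterer.whichCluster(i) << "\n";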
Example No. 5
void selfOrganizingMap(GArgReader& args){
  // Load the file
  GMatrix* pData = loadData(args.pop_string());
  Holder<GMatrix> hData(pData);

  // Parse arguments
  std::vector<double> netDims;
  unsigned numNodes = 1;
  while(args.next_is_uint()){
    unsigned dim = args.pop_uint();
    netDims.push_back(dim);
    numNodes *= dim;
  }
  if(netDims.size() < 1){
    throw Ex("No dimensions specified for the self-organizing map. ",
	       "A map must be at least 1-dimensional.");
  }

  Holder<SOM::ReporterChain> reporters(new SOM::ReporterChain);
  Holder<GDistanceMetric> weightDist(new GRowDistance);
  Holder<GDistanceMetric> nodeDist(new GRowDistance);
  Holder<SOM::NodeLocationInitialization> topology(new SOM::GridTopology);
  Holder<SOM::NodeWeightInitialization> weightInit
    (new SOM::NodeWeightInitializationTrainingSetSample(NULL));
  Holder<SOM::NeighborhoodWindowFunction> 
    windowFunc(new SOM::GaussianWindowFunction());

  //Loading and saving
  string loadFrom = "";
  string saveTo = "";

  //Parameters for different training algorithms
  string algoName = "batch";
  double startWidth = -1;     // start width - set later if still negative
  double endWidth   = -1;     // end width - set later if still negative
  double startRate  = -1;     // start learning rate
  double endRate    = -1;     // end learning rate
  unsigned numIter     = 100; // total number of iterations
  unsigned numConverge = 1;   // number of steps for batch training to converge

  while(args.next_is_flag()){
    if(args.if_pop("-tofile")){
      saveTo = args.pop_string();
    }else if(args.if_pop("-fromfile")){
      loadFrom = args.pop_string();
    }else if(args.if_pop("-seed")){
      GRand::global().setSeed(args.pop_uint());
    }else if(args.if_pop("-neighborhood")){
      string name = args.pop_string();
      if(name == "gaussian"){
	windowFunc.reset(new SOM::GaussianWindowFunction());
      }else if(name == "uniform"){
	windowFunc.reset(new SOM::UniformWindowFunction());
      }else{
	throw Ex("Only gaussian and uniform are acceptible ",
		   "neighborhood types");
      }
    }else if(args.if_pop("-printMeshEvery")){
      using namespace SOM;
      unsigned interval = args.pop_uint();
      string baseFilename = args.pop_string();
      unsigned xDim = args.pop_uint();
      unsigned yDim = args.pop_uint();
      bool showTrain = false;
      if(args.if_pop("showTrain") || args.if_pop("showtrain")){
	showTrain = true;
      }
      smart_ptr<Reporter> weightReporter
	(new SVG2DWeightReporter(baseFilename, xDim, yDim, showTrain));
      Holder<IterationIntervalReporter> intervalReporter
	(new IterationIntervalReporter(weightReporter, interval));
      reporters->add(intervalReporter.release());
    }else if(args.if_pop("-batchTrain")){
      algoName = "batch";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      numIter = args.pop_uint();
      numConverge = args.pop_uint();
    }else if(args.if_pop("-stdTrain")){
      algoName = "standard";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      startRate = args.pop_double();
      endRate = args.pop_double();
      numIter = args.pop_uint();
    }else{
      throw Ex("Invalid option: ", args.peek());
    }
  }

  //Create the training algorithm
  Holder<SOM::TrainingAlgorithm> algo;
  if(algoName == "batch"){
    double netRadius = *std::max_element(netDims.begin(), netDims.end());
    if(startWidth < 0){ startWidth = 2*netRadius; }
    if(endWidth < 0){ endWidth = 1; }
    algo.reset( new SOM::BatchTraining
      (startWidth, endWidth, numIter, numConverge,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else if(algoName == "standard"){
    algo.reset( new SOM::TraditionalTraining
      (startWidth, endWidth, startRate, endRate, numIter,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else{
    throw Ex("Unknown type of training algorithm: \"",
	       algoName, "\"");
  }

  //Create the network & transform the data
  Holder<GSelfOrganizingMap> som;
  Holder<GMatrix> out;
  
  if(loadFrom == ""){
    //Create map from arguments given
    som.reset(new GSelfOrganizingMap
      (netDims, numNodes, topology.release(), algo.release(), 
       weightDist.release(), nodeDist.release()));
    //Train the network and transform the data in place
    out.reset(som->doit(*pData));
  }else{
    //Create map from file
    GDom source;
    source.loadJson(loadFrom.c_str());
    som.reset(new GSelfOrganizingMap(source.root()));
    //Transform using the loaded network
    out.reset(som->transformBatch(*pData));
  }

  //Save the trained network
  if(saveTo != ""){
    GDom serialized;
    GDomNode* root = som->serialize(&serialized);
    serialized.setRoot(root);
    serialized.saveJson(saveTo.c_str());
  }

  //Print the result
  out->print(cout);
}
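A minimal invocation sketch, assuming the surrounding tool dispatches to selfOrganizingMap after popping its command name; the file name and the 10x10 grid are made up:

char* argList[] = { (char*)"in.arff", (char*)"10", (char*)"10",
                    (char*)"-tofile", (char*)"map.json" };
GArgReader args(5, argList);
selfOrganizingMap(args); // trains a 10x10 batch SOM, saves it to map.json, prints the mapped data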