// Gathers column c from every ".arff" file in the current directory into a
// single matrix (one column per file, in directory-listing order) and prints
// the result to stdout.
// args: the remaining command-line arguments; pops one uint (the column index c).
// Throws Ex if the directory contains no ".arff" files.
void aggregateCols(GArgReader& args)
{
	size_t c = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	size_t i = 0; // destination column for the next arff file
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue; // skip files without the .arff extension
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			// Size the results from the first arff file. (files.size() counts all
			// files, so trailing columns stay untouched if some are not arff.)
			pResults = new GMatrix(pData->rows(), files.size());
			hResults.reset(pResults);
		}
		pResults->copyColumns(i, pData, c, 1);
		i++;
	}
	// Fix: the original dereferenced a NULL pointer here when no .arff
	// files were found in the current directory.
	if(!pResults)
		throw Ex("No \".arff\" files were found in the current directory");
	pResults->print(cout);
}
// virtual void GWag::trainInner(GMatrix& features, GMatrix& labels) { GNeuralNet* pTemp = NULL; Holder<GNeuralNet> hTemp; size_t weights = 0; double* pWeightBuf = NULL; double* pWeightBuf2 = NULL; ArrayHolder<double> hWeightBuf; for(size_t i = 0; i < m_models; i++) { m_pNN->train(features, labels); if(pTemp) { // Average m_pNN with pTemp if(!m_noAlign) m_pNN->align(*pTemp); pTemp->weights(pWeightBuf); m_pNN->weights(pWeightBuf2); GVec::multiply(pWeightBuf, double(i) / (i + 1), weights); GVec::addScaled(pWeightBuf, 1.0 / (i + 1), pWeightBuf2, weights); pTemp->setWeights(pWeightBuf); } else { // Copy the m_pNN GDom doc; GDomNode* pNode = m_pNN->serialize(&doc); GLearnerLoader ll(m_rand); pTemp = new GNeuralNet(pNode, ll); hTemp.reset(pTemp); weights = pTemp->countWeights(); pWeightBuf = new double[2 * weights]; hWeightBuf.reset(pWeightBuf); pWeightBuf2 = pWeightBuf + weights; } } pTemp->weights(pWeightBuf); m_pNN->setWeights(pWeightBuf); }
// Gathers row r from every ".arff" file in the current directory into a
// single matrix (one row per file, in directory-listing order) and prints
// the result to stdout.
// args: the remaining command-line arguments; pops one uint (the row index r).
// Throws Ex if the directory contains no ".arff" files.
void aggregateRows(GArgReader& args)
{
	size_t r = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue; // skip files without the .arff extension
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			// Adopt the relation of the first arff file so rows are compatible.
			pResults = new GMatrix(pData->relation());
			hResults.reset(pResults);
		}
		// Transfer ownership of row r from pData to the results matrix.
		pResults->takeRow(pData->releaseRow(r));
	}
	// Fix: the original dereferenced a NULL pointer here when no .arff
	// files were found in the current directory.
	if(!pResults)
		throw Ex("No \".arff\" files were found in the current directory");
	pResults->print(cout);
}
// virtual
// Performs agglomerative (bottom-up) clustering on pData: starts with every
// row in its own cluster, then repeatedly merges the two clusters joined by
// the shortest neighbor edge until only m_clusterCount clusters remain.
// Results are stored in m_pClusters (one cluster id per row).
void GAgglomerativeClusterer::cluster(const GMatrix* pData)
{
	// Init the metric
	if(!m_pMetric)
		setMetric(new GRowDistance(), true);
	m_pMetric->init(&pData->relation(), false);

	// Find enough neighbors to form a connected graph
	// (grow the neighbor count by 3/2 until the neighbor graph is connected)
	GNeighborGraph* pNF = NULL;
	Holder<GNeighborGraph> hNF;
	size_t neighbors = 6;
	while(true)
	{
		GKdTree* pKdTree = new GKdTree(pData, neighbors, m_pMetric, false);
		pNF = new GNeighborGraph(pKdTree, true);
		hNF.reset(pNF);
		pNF->fillCache();
		if(pNF->isConnected())
			break;
		if(neighbors + 1 >= pData->rows())
			throw Ex("internal problem--a graph with so many neighbors must be connected");
		neighbors = std::min((neighbors * 3) / 2, pData->rows() - 1);
	}

	// Sort all the neighbors by their distances
	// Each entry pairs a squared distance with its flat index into the
	// neighbor cache; entries with an invalid neighbor id are skipped.
	size_t count = pData->rows() * neighbors;
	vector< std::pair<double,size_t> > distNeighs;
	distNeighs.resize(count);
	double* pDistances = pNF->squaredDistanceTable();
	size_t* pRows = pNF->cache();
	size_t index = 0; // NOTE(review): written (index--) but never read — appears vestigial
	vector< std::pair<double,size_t> >::iterator it = distNeighs.begin();
	for(size_t i = 0; i < count; i++)
	{
		if(*pRows < pData->rows())
		{
			it->first = *pDistances;
			it->second = i;
			it++;
		}
		else
			index--;
		pRows++;
		pDistances++;
	}
	// Only [begin, it) holds valid entries; sort just that prefix.
	std::sort(distNeighs.begin(), it);

	// Assign each row to its own cluster
	delete[] m_pClusters;
	m_pClusters = new size_t[pData->rows()]; // specifies which cluster each row belongs to
	GIndexVec::makeIndexVec(m_pClusters, pData->rows());
	size_t* pSiblings = new size_t[pData->rows()]; // a cyclical linked list of each row in the cluster
	ArrayHolder<size_t> hSiblings(pSiblings);
	GIndexVec::makeIndexVec(pSiblings, pData->rows()); // init such that each row is in a cluster of 1
	size_t currentClusterCount = pData->rows();
	if(currentClusterCount <= m_clusterCount)
		return; // nothing to do

	// Merge until we have the desired number of clusters
	pRows = pNF->cache();
	for(vector< std::pair<double,size_t> >::iterator dn = distNeighs.begin(); dn != it; dn++)
	{
		// Get the next two closest points
		// (flat index decodes as: row = index / neighbors, neighbor = cache[index])
		size_t a = dn->second / neighbors;
		size_t b = pRows[dn->second];
		GAssert(a != b && a < pData->rows() && b < pData->rows());
		size_t clustA = m_pClusters[a];
		size_t clustB = m_pClusters[b];

		// Merge the clusters
		if(clustA == clustB)
			continue; // The two points are already in the same cluster
		if(clustB < clustA) // Make sure clustA has the smaller value
		{
			std::swap(a, b);
			std::swap(clustA, clustB);
		}
		for(size_t i = pSiblings[b]; true; i = pSiblings[i]) // Convert every row in clustB to clustA
		{
			m_pClusters[i] = clustA;
			if(i == b)
				break;
		}
		std::swap(pSiblings[a], pSiblings[b]); // This line joins the cyclical linked lists into one big cycle
		if(clustB < m_clusterCount) // Ensure that the first m_clusterCount cluster numbers are always in use
		{
			for(size_t i = 0; i < pData->rows(); i++) // rename another cluster to take the spot of clustB
			{
				if(m_pClusters[i] >= m_clusterCount)
				{
					for(size_t j = pSiblings[i]; true; j = pSiblings[j])
					{
						m_pClusters[j] = clustB;
						if(j == i)
							break;
					}
					break;
				}
			}
		}
		if(--currentClusterCount <= m_clusterCount)
			return; // reached the requested number of clusters
	}
	throw Ex("internal error--should have found the desired number of clusters before now");
}
// Command-line tool: trains (or loads) a self-organizing map, transforms the
// input dataset with it, and prints the transformed data to stdout.
// args: input filename, then one or more uint map dimensions, then optional
// flags (-tofile, -fromfile, -seed, -neighborhood, -printMeshEvery,
// -batchTrain, -stdTrain).
// Throws Ex on missing dimensions, unknown options, or unknown algorithm names.
void selfOrganizingMap(GArgReader& args){
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse arguments
	std::vector<double> netDims;
	unsigned numNodes = 1;
	while(args.next_is_uint()){
		unsigned dim = args.pop_uint();
		netDims.push_back(dim);
		numNodes *= dim;
	}
	if(netDims.size() < 1){
		throw Ex("No dimensions specified for self organizing map. ",
			 "A map must be at least 1 dimensional.");
	}
	// Defaults for every configurable component; flags below may replace them.
	// Fix: removed the dead local "Holder<SOM::TrainingAlgorithm> alg(NULL);"
	// that the original declared and never used (the real holder is "algo").
	Holder<SOM::ReporterChain> reporters(new SOM::ReporterChain);
	Holder<GDistanceMetric> weightDist(new GRowDistance);
	Holder<GDistanceMetric> nodeDist(new GRowDistance);
	Holder<SOM::NodeLocationInitialization> topology(new SOM::GridTopology);
	Holder<SOM::NodeWeightInitialization> weightInit
		(new SOM::NodeWeightInitializationTrainingSetSample(NULL));
	Holder<SOM::NeighborhoodWindowFunction> windowFunc(new SOM::GaussianWindowFunction());
	//Loading and saving
	string loadFrom = "";
	string saveTo = "";
	//Parameters for different training algorithms
	string algoName = "batch";
	double startWidth = -1;//Start width - set later if still negative
	double endWidth = -1;//End width - set later if still negative
	double startRate = -1;//Start learning rate
	double endRate = -1;//End learning rate
	unsigned numIter = 100;//Total iterations
	unsigned numConverge = 1;//#steps for batch to converge
	while(args.next_is_flag()){
		if(args.if_pop("-tofile")){
			saveTo = args.pop_string();
		}else if(args.if_pop("-fromfile")){
			loadFrom = args.pop_string();
		}else if(args.if_pop("-seed")){
			GRand::global().setSeed(args.pop_uint());
		}else if(args.if_pop("-neighborhood")){
			string name = args.pop_string();
			if(name == "gaussian"){
				windowFunc.reset(new SOM::GaussianWindowFunction());
			}else if(name == "uniform"){
				windowFunc.reset(new SOM::UniformWindowFunction());
			}else{
				throw Ex("Only gaussian and uniform are acceptible ",
					 "neighborhood types");
			}
		}else if(args.if_pop("-printMeshEvery")){
			// Periodically dump an SVG picture of the 2D mesh while training.
			using namespace SOM;
			unsigned interval = args.pop_uint();
			string baseFilename = args.pop_string();
			unsigned xDim = args.pop_uint();
			unsigned yDim = args.pop_uint();
			bool showTrain = false;
			if(args.if_pop("showTrain") || args.if_pop("showtrain")){
				showTrain = true;
			}
			smart_ptr<Reporter> weightReporter
				(new SVG2DWeightReporter(baseFilename, xDim, yDim, showTrain));
			Holder<IterationIntervalReporter> intervalReporter
				(new IterationIntervalReporter(weightReporter, interval));
			reporters->add(intervalReporter.release());
		}else if(args.if_pop("-batchTrain")){
			algoName = "batch";
			startWidth = args.pop_double();
			endWidth = args.pop_double();
			numIter = args.pop_uint();
			numConverge = args.pop_uint();
		}else if(args.if_pop("-stdTrain")){
			algoName = "standard";
			startWidth = args.pop_double();
			endWidth = args.pop_double();
			startRate = args.pop_double();
			endRate = args.pop_double();
			numIter = args.pop_uint();
		}else{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	//Create the training algorithm
	Holder<SOM::TrainingAlgorithm> algo;
	if(algoName == "batch"){
		// Default widths are derived from the largest map dimension.
		double netRadius = *std::max_element(netDims.begin(), netDims.end());
		if(startWidth < 0){ startWidth = 2*netRadius; }
		if(endWidth < 0){ endWidth = 1; }
		algo.reset( new SOM::BatchTraining
			    (startWidth, endWidth, numIter, numConverge,
			     weightInit.release(), windowFunc.release(),
			     reporters.release()));
	}else if(algoName == "standard"){
		algo.reset( new SOM::TraditionalTraining
			    (startWidth, endWidth, startRate, endRate, numIter,
			     weightInit.release(), windowFunc.release(),
			     reporters.release()));
	}else{
		throw Ex("Unknown type of training algorithm: \"", algoName, "\"");
	}

	//Create the network & transform the data
	Holder<GSelfOrganizingMap> som;
	Holder<GMatrix> out;
	if(loadFrom == ""){
		//Create map from arguments given
		som.reset(new GSelfOrganizingMap
			  (netDims, numNodes, topology.release(), algo.release(),
			   weightDist.release(), nodeDist.release()));
		//Train the network and transform the data in place
		out.reset(som->doit(*pData));
	}else{
		//Create map from file
		GDom source;
		source.loadJson(loadFrom.c_str());
		som.reset(new GSelfOrganizingMap(source.root()));
		//Transform using the loaded network
		out.reset(som->transformBatch(*pData));
	}

	//Save the trained network
	if(saveTo != ""){
		GDom serialized;
		GDomNode* root = som->serialize(&serialized);
		serialized.setRoot(root);
		serialized.saveJson(saveTo.c_str());
	}

	//Print the result
	out->print(cout);
}