void AddIndexAttribute(GArgReader& args)
{
	// Parse args
	const char* filename = args.pop_string();
	double nStartValue = 0.0;
	double nIncrement = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-start"))
			nStartValue = args.pop_double();
		else if(args.if_pop("-increment"))
			nIncrement = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	GArffRelation* pIndexRelation = new GArffRelation();
	pIndexRelation->addAttribute("index", 0, NULL);
	sp_relation pIndexRel = pIndexRelation;
	GMatrix indexes(pIndexRel);
	indexes.newRows(pData->rows());
	for(size_t i = 0; i < pData->rows(); i++)
		indexes.row(i)[0] = nStartValue + i * nIncrement;
	GMatrix* pUnified = GMatrix::mergeHoriz(&indexes, pData);
	Holder<GMatrix> hUnified(pUnified);
	pUnified->print(cout);
}
void threshold(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned column = args.pop_uint();
	if(column >= hData->cols())
	{
		std::stringstream msg;
		if(hData->cols() >= 1)
		{
			msg << "The column to threshold is too large. It should be in "
			    << "the range [0.." << (hData->cols() - 1) << "].";
		}
		else
		{
			msg << "This data has no columns to threshold.";
		}
		ThrowError(msg.str());
	}
	if(hData->relation()->valueCount(column) != 0)
	{
		ThrowError("Can only use threshold on continuous attributes.");
	}
	double value = args.pop_double();

	// Do the actual thresholding
	for(size_t i = 0; i < hData->rows(); ++i)
	{
		double& v = hData->row(i)[column];
		if(v <= value)
			v = 0;
		else
			v = 1;
	}

	// Print the data
	hData->print(cout);
}
void fuzzykmeans(GArgReader& args)
{
	// Load the file and params
	GMatrix data;
	loadData(data, args.pop_string());
	int clusters = args.pop_uint();

	// Parse Options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	double fuzzifier = 1.3;
	size_t reps = 1;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else if(args.if_pop("-fuzzifier"))
			fuzzifier = args.pop_double();
		else if(args.if_pop("-reps"))
			reps = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Do the clustering
	GRand prng(nSeed);
	GFuzzyKMeans clusterer(clusters, &prng);
	clusterer.setFuzzifier(fuzzifier);
	clusterer.setReps(reps);
	GMatrix* pOut = clusterer.reduce(data);
	std::unique_ptr<GMatrix> hOut(pOut);
	pOut->print(cout);
}
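// Prints the p-value for the Wilcoxon signed ranks test, given the number of
// samples (n) and the W statistic (w), as computed by GMath::wilcoxonPValue.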
void wilcoxon(GArgReader& args)
{
	size_t n = args.pop_uint();
	double w = args.pop_double();
	double p = GMath::wilcoxonPValue(n, w);
	cout << p << "\n";
}
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Add Gaussian noise with the specified deviation to every value,
	// except in the last excludeLast columns
	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
void multiplyScalar(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	double scale = args.pop_double();
	if(args.size() > 0)
		ThrowError("Superfluous arg: ", args.pop_string());
	pA->multiply(scale);
	pA->print(cout);
}
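// Reduces the data to targetDims dimensions with Manifold Sculpting
// (GManifoldSculpting). A neighbor finder is instantiated from the arguments,
// and -continue can supply preprocessed (hint) intrinsic coordinates to resume from.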
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	size_t targetDims = args.pop_uint();

	// Parse Options
	const char* szPreprocessedData = NULL;
	double scaleRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-continue"))
			szPreprocessedData = args.pop_string();
		else if(args.if_pop("-scalerate"))
			scaleRate = args.pop_double();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data
	GMatrix* pDataHint = NULL;
	Holder<GMatrix> hDataHint(NULL);
	if(szPreprocessedData)
	{
		pDataHint = loadData(szPreprocessedData);
		hDataHint.reset(pDataHint);
		if(pDataHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pDataHint->rows() != pData->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data
	GManifoldSculpting transform(pNF->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(scaleRate);
	if(pDataHint)
		transform.setPreprocessedData(hDataHint.release());
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
void shiftColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double offset = args.pop_double();
	for(size_t i = 0; i < pA->rows(); i++)
	{
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += offset;
	}
	pA->print(cout);
}
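// Replaces a randomly-chosen k = portion * n of the n values in the dataset
// with UNKNOWN_REAL_VALUE (continuous attributes) or UNKNOWN_DISCRETE_VALUE
// (nominal attributes). Each cell is visited once and marked with probability
// k/n, so exactly k cells are dropped (sequential sampling without replacement).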
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
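// Rotates the points in the plane spanned by two continuous columns by the
// given angle (specified in degrees) about the origin.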
void rotate(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	sp_relation relation = pA->relation();
	unsigned colx = args.pop_uint();
	if(colx >= pA->cols())
	{
		ThrowError("Rotation first column index (", to_str(colx), ") "
			"should not be greater "
			"than the largest index, which is ", to_str(pA->cols() - 1), ".");
	}
	if(!relation->areContinuous(colx, 1))
	{
		ThrowError("Rotation first column index (", to_str(colx), ") "
			"should be continuous and it is not.");
	}
	unsigned coly = args.pop_uint();
	if(coly >= pA->cols())
	{
		ThrowError("Rotation second column index (", to_str(coly), ") "
			"should not be greater "
			"than the largest index, which is ", to_str(pA->cols() - 1), ".");
	}
	if(!relation->areContinuous(coly, 1))
	{
		ThrowError("Rotation second column index (", to_str(coly), ") "
			"should be continuous and it is not.");
	}

	double angle = args.pop_double();
	angle = angle * M_PI / 180; // Convert from degrees to radians
	double cosAngle = std::cos(angle);
	double sinAngle = std::sin(angle);
	for(std::size_t rowIdx = 0; rowIdx < pA->rows(); ++rowIdx)
	{
		double* row = (*pA)[rowIdx];
		double x = row[colx];
		double y = row[coly];
		row[colx] = x * cosAngle - y * sinAngle;
		row[coly] = x * sinAngle + y * cosAngle;
	}
	pA->print(cout);
}
void normalize(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	double min = 0.0;
	double max = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-range"))
		{
			min = args.pop_double();
			max = args.pop_double();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GNormalize transform(min, max);
	transform.train(*pData);
	GMatrix* pOut = transform.transformBatch(*pData);
	Holder<GMatrix> hOut(pOut);
	pOut->print(cout);
}
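// Reduces the data to the specified number of intrinsic (target) dimensions
// with unsupervised backpropagation (GUnsupervisedBackProp). Options allow
// adding hidden layers, jittering images, loading or saving the model as JSON,
// and recording training progress; the transformed data is printed to stdout.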
void unsupervisedBackProp(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int targetDims = args.pop_uint();

	// Parse Options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GUnsupervisedBackProp* pUBP = new GUnsupervisedBackProp(targetDims, &prng);
	Holder<GUnsupervisedBackProp> hUBP(pUBP);
	vector<size_t> paramRanges;
	string sModelOut;
	string sProgress;
	bool inputBias = true;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-addlayer"))
			pUBP->neuralNet()->addLayer(args.pop_uint());
		else if(args.if_pop("-params"))
		{
			if(pUBP->jitterer())
				throw Ex("You can't change the params after you add an image jitterer");
			size_t paramDims = args.pop_uint();
			for(size_t i = 0; i < paramDims; i++)
				paramRanges.push_back(args.pop_uint());
		}
		else if(args.if_pop("-modelin"))
		{
			GDom doc;
			doc.loadJson(args.pop_string());
			GLearnerLoader ll(prng);
			pUBP = new GUnsupervisedBackProp(doc.root(), ll);
			hUBP.reset(pUBP);
		}
		else if(args.if_pop("-modelout"))
			sModelOut = args.pop_string();
		else if(args.if_pop("-intrinsicin"))
		{
			GMatrix* pInt = new GMatrix();
			pInt->loadArff(args.pop_string());
			pUBP->setIntrinsic(pInt);
		}
		else if(args.if_pop("-jitter"))
		{
			if(paramRanges.size() != 2)
				throw Ex("The params must be set to 2 before a tweaker is set");
			size_t channels = args.pop_uint();
			double rot = args.pop_double();
			double trans = args.pop_double();
			double zoom = args.pop_double();
			GImageJitterer* pJitterer = new GImageJitterer(paramRanges[0], paramRanges[1], channels, rot, trans, zoom);
			pUBP->setJitterer(pJitterer);
		}
		else if(args.if_pop("-noinputbias"))
			inputBias = false;
		else if(args.if_pop("-progress"))
		{
			sProgress = args.pop_string();
			pUBP->trackProgress();
		}
		else if(args.if_pop("-onepass"))
			pUBP->onePass();
		else
			throw Ex("Invalid option: ", args.peek());
	}
	pUBP->setParams(paramRanges);
	pUBP->setUseInputBias(inputBias);

	// Transform the data
	GMatrix* pDataAfter = pUBP->doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);

	// Save the model (if requested)
	if(sModelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pUBP->serialize(&doc));
		doc.saveJson(sModelOut.c_str());
	}
	if(sProgress.length() > 0)
		pUBP->progress().saveArff(sProgress.c_str());
}
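// Loads a serialized learner from JSON and extrapolates the series with
// GNeuralDecomposition::extrapolate, either over a supplied feature matrix or
// over the [start, start + length) range with the given step. Note that the
// loaded learner is cast directly to GNeuralDecomposition, so the model file
// must contain one.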
void Extrapolate(GArgReader &args)
{
	// Load the model
	if(args.size() < 1)
	{
		throw Ex("Model not specified.");
	}
	GDom doc;
	doc.loadJson(args.pop_string());
	GLearnerLoader ll(true);
	GSupervisedLearner *pLearner = ll.loadLearner(doc.root());
	std::unique_ptr<GSupervisedLearner> hLearner(pLearner);

	// Parse options
	double start = 1.0;
	double length = 1.0;
	double step = 0.0002;
	bool useFeatures = false;
	bool outputFeatures = true;
	GNeuralDecomposition *nd = (GNeuralDecomposition *) pLearner;
	std::unique_ptr<GMatrix> hFeatures;
	while(args.next_is_flag())
	{
		if(args.if_pop("-start"))
		{
			start = args.pop_double();
		}
		else if(args.if_pop("-length"))
		{
			length = args.pop_double();
		}
		else if(args.if_pop("-step"))
		{
			step = args.pop_double();
		}
		else if(args.if_pop("-features"))
		{
			LoadData(args, hFeatures);
			useFeatures = true;
		}
		else if(args.if_pop("-outputFeatures"))
		{
			outputFeatures = true;
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Extrapolate
	GMatrix *pOutput;
	if(useFeatures)
		pOutput = nd->extrapolate(*hFeatures.get());
	else
		pOutput = nd->extrapolate(start, length, step, outputFeatures);
	std::unique_ptr<GMatrix> hOutput(pOutput);

	// Output predictions
	pOutput->print(cout);
}
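// Trains a GNeuralDecomposition model on a time series. A one-column input is
// treated as the series itself; a two-column input is split into features
// (column 0) and the series (column 1). If no features are supplied,
// equally-spaced time indices in [0, 1) are generated.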
void Train(GArgReader &args)
{
	// Load series from file
	std::unique_ptr<GMatrix> hSeries, hFeatures;
	LoadData(args, hSeries);
	GMatrix *pSeries = hSeries.get();

	// Split features/labels
	if(pSeries->cols() == 2)
	{
		GMatrix *pFeatures = pSeries->cloneSub(0, 0, pSeries->rows(), 1);
		GMatrix *pLabels = pSeries->cloneSub(0, 1, pSeries->rows(), 1);
		hFeatures.reset(pFeatures);
		hSeries.reset(pLabels);
		pSeries = pLabels;
	}
	else if(pSeries->cols() > 2)
	{
		throw Ex("Too many columns!");
	}

	// Parse options
	GNeuralDecomposition *nd = new GNeuralDecomposition();
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularization"))
			nd->setRegularization(args.pop_double());
		else if(args.if_pop("-learningRate"))
			nd->setLearningRate(args.pop_double());
		else if(args.if_pop("-linearUnits"))
			nd->setLinearUnits(args.pop_uint());
		else if(args.if_pop("-softplusUnits"))
			nd->setSoftplusUnits(args.pop_uint());
		else if(args.if_pop("-sigmoidUnits"))
			nd->setSigmoidUnits(args.pop_uint());
		else if(args.if_pop("-epochs"))
			nd->setEpochs(args.pop_uint());
		else if(args.if_pop("-features"))
			LoadData(args, hFeatures);
		else if(args.if_pop("-filterLogarithm"))
			nd->setFilterLogarithm(true);
		else
			throw Ex("Invalid option: ", args.peek());
	}

	if(hFeatures.get() == NULL)
	{
		// Generate features
		GMatrix *pFeatures = new GMatrix(pSeries->rows(), 1);
		for(size_t i = 0; i < pSeries->rows(); i++)
		{
			pFeatures->row(i)[0] = i / (double) pSeries->rows();
		}
		hFeatures.reset(pFeatures);
	}

	// Train
	GMatrix *pFeatures = hFeatures.get();
	nd->train(*pFeatures, *pSeries);

	// Output the trained model
	GDom doc;
	doc.setRoot(nd->serialize(&doc));
	doc.writeJson(cout);
}
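// Trains (or loads) a self-organizing map and prints the transformed data.
// The network dimensions are given as leading unsigned arguments; both batch
// and standard (traditional) training are supported, and the trained network
// can be saved to or loaded from a JSON file.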
void selfOrganizingMap(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse arguments
	std::vector<double> netDims;
	unsigned numNodes = 1;
	while(args.next_is_uint())
	{
		unsigned dim = args.pop_uint();
		netDims.push_back(dim);
		numNodes *= dim;
	}
	if(netDims.size() < 1)
	{
		throw Ex("No dimensions specified for self organizing map. ",
			"A map must be at least 1 dimensional.");
	}
	Holder<SOM::ReporterChain> reporters(new SOM::ReporterChain);
	Holder<SOM::TrainingAlgorithm> alg(NULL);
	Holder<GDistanceMetric> weightDist(new GRowDistance);
	Holder<GDistanceMetric> nodeDist(new GRowDistance);
	Holder<SOM::NodeLocationInitialization> topology(new SOM::GridTopology);
	Holder<SOM::NodeWeightInitialization> weightInit
		(new SOM::NodeWeightInitializationTrainingSetSample(NULL));
	Holder<SOM::NeighborhoodWindowFunction> windowFunc(new SOM::GaussianWindowFunction());

	// Loading and saving
	string loadFrom = "";
	string saveTo = "";

	// Parameters for different training algorithms
	string algoName = "batch";
	double startWidth = -1;   // Start width - set later if still negative
	double endWidth = -1;     // End width - set later if still negative
	double startRate = -1;    // Start learning rate
	double endRate = -1;      // End learning rate
	unsigned numIter = 100;   // Total iterations
	unsigned numConverge = 1; // #steps for batch to converge

	while(args.next_is_flag())
	{
		if(args.if_pop("-tofile"))
		{
			saveTo = args.pop_string();
		}
		else if(args.if_pop("-fromfile"))
		{
			loadFrom = args.pop_string();
		}
		else if(args.if_pop("-seed"))
		{
			GRand::global().setSeed(args.pop_uint());
		}
		else if(args.if_pop("-neighborhood"))
		{
			string name = args.pop_string();
			if(name == "gaussian")
			{
				windowFunc.reset(new SOM::GaussianWindowFunction());
			}
			else if(name == "uniform")
			{
				windowFunc.reset(new SOM::UniformWindowFunction());
			}
			else
			{
				throw Ex("Only gaussian and uniform are acceptable ",
					"neighborhood types");
			}
		}
		else if(args.if_pop("-printMeshEvery"))
		{
			using namespace SOM;
			unsigned interval = args.pop_uint();
			string baseFilename = args.pop_string();
			unsigned xDim = args.pop_uint();
			unsigned yDim = args.pop_uint();
			bool showTrain = false;
			if(args.if_pop("showTrain") || args.if_pop("showtrain"))
			{
				showTrain = true;
			}
			smart_ptr<Reporter> weightReporter
				(new SVG2DWeightReporter(baseFilename, xDim, yDim, showTrain));
			Holder<IterationIntervalReporter> intervalReporter
				(new IterationIntervalReporter(weightReporter, interval));
			reporters->add(intervalReporter.release());
		}
		else if(args.if_pop("-batchTrain"))
		{
			algoName = "batch";
			startWidth = args.pop_double();
			endWidth = args.pop_double();
			numIter = args.pop_uint();
			numConverge = args.pop_uint();
		}
		else if(args.if_pop("-stdTrain"))
		{
			algoName = "standard";
			startWidth = args.pop_double();
			endWidth = args.pop_double();
			startRate = args.pop_double();
			endRate = args.pop_double();
			numIter = args.pop_uint();
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Create the training algorithm
	Holder<SOM::TrainingAlgorithm> algo;
	if(algoName == "batch")
	{
		double netRadius = *std::max_element(netDims.begin(), netDims.end());
		if(startWidth < 0)
		{
			startWidth = 2 * netRadius;
		}
		if(endWidth < 0)
		{
			endWidth = 1;
		}
		algo.reset(new SOM::BatchTraining
			(startWidth, endWidth, numIter, numConverge,
			 weightInit.release(), windowFunc.release(),
			 reporters.release()));
	}
	else if(algoName == "standard")
	{
		algo.reset(new SOM::TraditionalTraining
			(startWidth, endWidth, startRate, endRate, numIter,
			 weightInit.release(), windowFunc.release(),
			 reporters.release()));
	}
	else
	{
		throw Ex("Unknown type of training algorithm: \"", algoName, "\"");
	}

	// Create the network & transform the data
	Holder<GSelfOrganizingMap> som;
	Holder<GMatrix> out;
	if(loadFrom == "")
	{
		// Create map from arguments given
		som.reset(new GSelfOrganizingMap
			(netDims, numNodes, topology.release(), algo.release(),
			 weightDist.release(), nodeDist.release()));
		// Train the network and transform the data in place
		out.reset(som->doit(*pData));
	}
	else
	{
		// Create map from file
		GDom source;
		source.loadJson(loadFrom.c_str());
		som.reset(new GSelfOrganizingMap(source.root()));
		// Transform using the loaded network
		out.reset(som->transformBatch(*pData));
	}

	// Save the trained network
	if(saveTo != "")
	{
		GDom serialized;
		GDomNode* root = som->serialize(&serialized);
		serialized.setRoot(root);
		serialized.saveJson(saveTo.c_str());
	}

	// Print the result
	out->print(cout);
}
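// Prints basic statistics for two columns and then tests whether they differ
// significantly, using a paired t-test (raw and normalized) and the Wilcoxon
// signed ranks test.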
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Wilcoxon Signed Ranks Test\n";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";

		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less than 0.05 is considered to be significant.\n";
/*
		double p_sum = GMath::wilcoxonPValue(num, w_sum);
		cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
*/
	}
}
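// Prints each line of the file with probability "portion". If the file is an
// ARFF file, the header (through the @DATA line) is always passed through.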
void sampleRows(GArgReader& args)
{
	const char* filename = args.pop_string();
	double portion = args.pop_double();
	if(portion < 0 || portion > 1)
		ThrowError("The portion must be between 0 and 1");
	PathData pd;
	GFile::parsePath(filename, &pd);
	bool arff = false;
	if(_stricmp(filename + pd.extStart, ".arff") == 0)
		arff = true;

	// Parse Options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}
	GRand rand(seed);

	// Open the file and determine its size
	size_t size = 0;
	std::ifstream s;
	s.exceptions(std::ios::failbit | std::ios::badbit);
	try
	{
		s.open(filename, std::ios::binary);
		s.seekg(0, std::ios::end);
		size = (size_t)s.tellg();
		s.seekg(0, std::ios::beg);
	}
	catch(const std::exception&)
	{
		if(GFile::doesFileExist(filename))
			ThrowError("Error while trying to open the existing file: ", filename);
		else
			ThrowError("File not found: ", filename);
	}

	// Stream through the file, keeping each data line with probability "portion"
	char* pLine = new char[MAX_LINE_LENGTH];
	ArrayHolder<char> hLine(pLine);
	size_t line = 1;
	while(size > 0)
	{
		s.getline(pLine, std::min(size + 1, size_t(MAX_LINE_LENGTH)));
		size_t linelen = std::min(size, size_t(s.gcount()));
		if(linelen >= MAX_LINE_LENGTH - 1)
			ThrowError("Line ", to_str(line), " is too long"); // todo: just resize the buffer here
		if(arff) // Copy the ARFF header through verbatim until the data section begins
		{
			if(_strnicmp(pLine, "@DATA", 5) == 0)
				arff = false;
			cout << pLine << "\n";
		}
		else if(rand.uniform() < portion)
			cout << pLine << "\n";
		size -= linelen;
		line++;
	}
}