// Splits a dataset into two ARFF files.
//
// Arguments, consumed in order: dataset filename, row count N, first output
// filename, second output filename, then any of:
//   -shuffle       shuffle the rows (GMatrix::shuffle) before splitting
//   -seed <uint>   seed for the random number generator
//                  (defaults to getpid() * time(NULL))
//
// (rows - N) is handed to GMatrix::splitBySize together with an empty matrix
// that shares the loaded relation. Afterwards the loaded matrix is saved to the
// first filename and the other matrix to the second.
//
// Raises an error through ThrowError if N exceeds the number of rows or an
// unrecognized option is found.
void split(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Range-check N *before* subtracting. The previous form computed
	// (int)rows - unsigned, which is evaluated in unsigned arithmetic: a very
	// large N could wrap around to a positive value and slip past the check.
	const size_t totalRows = pData->rows();
	const size_t requested = args.pop_uint();
	if(requested > totalRows)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	int pats = static_cast<int>(totalRows - requested);

	const char* szFilename1 = args.pop_string();
	const char* szFilename2 = args.pop_string();

	// Parse options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool shouldShuffle = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
			shouldShuffle = true;
		else if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle if necessary
	GRand rng(nSeed);
	if(shouldShuffle)
		pData->shuffle(rng);

	// Split
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFilename1);
	other.saveArff(szFilename2);
}
// Trains a collaborative filter on one dataset, tests it on another, and
// prints the resulting MSE and MAE to stdout.
//
// Arguments, consumed in order:
//   [-seed <uint>]   optional leading flag; seed for the random number
//                    generator (defaults to getpid() * time(NULL))
//   training-set filename
//   test-set filename
//   algorithm specification, consumed by InstantiateAlgorithm
//
// Raises an error through ThrowError on an unknown flag, a missing dataset, or
// arguments left over after the algorithm has been instantiated.
void transacc(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid transacc option: ", args.peek()); // message previously named the wrong command
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No training set specified.");
	GMatrix* pTrain = loadData(args.pop_string());
	Holder<GMatrix> hTrain(pTrain);
	if(args.size() < 1)
		ThrowError("No test set specified.");
	GMatrix* pTest = loadData(args.pop_string());
	Holder<GMatrix> hTest(pTest);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Train on the first set and evaluate on the second
	double mae;
	double mse = pModel->trainAndTest(*pTrain, *pTest, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
// Trains a collaborative filter on one dataset, tests it on another, and
// prints the resulting MSE and MAE to stdout.
//
// Arguments, consumed in order:
//   [-seed <uint>]   optional leading flag; seed applied to the model's random
//                    number generator (defaults to getpid() * time(NULL))
//   training-set filename
//   test-set filename
//   algorithm specification, consumed by InstantiateAlgorithm
//
// Throws Ex on an unknown flag, a missing dataset, or arguments left over
// after the algorithm has been instantiated.
void GRecommenderLib::transacc(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			throw Ex("Invalid transacc option: ", args.peek()); // message previously named the wrong command
	}

	// Load the data
	if(args.size() < 1)
		throw Ex("No training set specified.");
	GMatrix train;
	loadData(train, args.pop_string());
	if(args.size() < 1)
		throw Ex("No test set specified.");
	GMatrix test;
	loadData(test, args.pop_string());

	// Instantiate the recommender
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Train on the first set and evaluate on the second
	double mae;
	double mse = pModel->trainAndTest(train, test, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
// Loads two datasets (two filenames popped from args), calls
// GMatrix::mergeVert on the first with the second as its argument, and prints
// the first matrix to stdout.
void mergeVert(GArgReader& args)
{
	GMatrix* pFirst = loadData(args.pop_string());
	Holder<GMatrix> hFirst(pFirst);

	GMatrix* pSecond = loadData(args.pop_string());
	Holder<GMatrix> hSecond(pSecond);

	pFirst->mergeVert(pSecond);
	pFirst->print(cout);
}
// Loads a dataset, calls GMatrix::centerMeanAtOrigin on it, and prints the
// result to stdout. Exactly one argument (the filename) is accepted; anything
// further is reported through ThrowError.
void zeroMean(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	const bool hasLeftovers = args.size() > 0;
	if(hasLeftovers)
		ThrowError("Superfluous arg: ", args.pop_string());

	pMatrix->centerMeanAtOrigin();
	pMatrix->print(cout);
}
// Loads two datasets (two filenames popped from args), calls GMatrix::add on
// the first with the second and `false` as arguments, and prints the first
// matrix to stdout.
void addMatrices(GArgReader& args)
{
	GMatrix* pLeft = loadData(args.pop_string());
	Holder<GMatrix> hLeft(pLeft);

	GMatrix* pRight = loadData(args.pop_string());
	Holder<GMatrix> hRight(pRight);

	pLeft->add(pRight, false);
	pLeft->print(cout);
}
// Loads two datasets (two filenames popped from args), passes them to the
// static GMatrix::align, and prints the matrix it returns to stdout. The
// returned matrix is owned (and freed) by a local Holder.
void align(GArgReader& args)
{
	GMatrix* pFirst = loadData(args.pop_string());
	Holder<GMatrix> hFirst(pFirst);

	GMatrix* pSecond = loadData(args.pop_string());
	Holder<GMatrix> hSecond(pSecond);

	GMatrix* pAligned = GMatrix::align(pFirst, pSecond);
	Holder<GMatrix> hAligned(pAligned);
	pAligned->print(cout);
}
// Loads a dataset, calls GMatrix::multiply with a scalar read from args, and
// prints the result to stdout.
//
// Arguments, consumed in order: dataset filename, scalar (double). Any further
// argument is reported through ThrowError.
void multiplyScalar(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	const double factor = args.pop_double();
	const bool hasLeftovers = args.size() > 0;
	if(hasLeftovers)
		ThrowError("Superfluous arg: ", args.pop_string());

	pMatrix->multiply(factor);
	pMatrix->print(cout);
}
// Loads two datasets (two filenames popped from args) and prints three figures
// derived from GMatrix::sumSquaredDifference(first, second, false): the sum
// itself, the sum divided by the first matrix's row count, and the square root
// of that quotient.
void squaredDistance(GArgReader& args)
{
	GMatrix* pLeft = loadData(args.pop_string());
	Holder<GMatrix> hLeft(pLeft);

	GMatrix* pRight = loadData(args.pop_string());
	Holder<GMatrix> hRight(pRight);

	const double sumSq = pLeft->sumSquaredDifference(*pRight, false);
	const double meanSq = sumSq / pLeft->rows();

	cout << "Sum squared distance: " << sumSq << "\n";
	cout << "Mean squared distance: " << meanSq << "\n";
	cout << "Root mean squared distance: " << sqrt(meanSq) << "\n";
}
// Loads a dataset named on the command line into hOutput.
//
// Arguments consumed from args: the filename, optionally followed by
// "-input_type <type>". Without that flag the type is taken from the filename's
// extension (leading '.' stripped); a filename with no extension is treated as
// "arff". Type names are compared case-insensitively:
//   arff  loaded with GMatrix::loadArff
//   csv   parsed with GCSVParser
//   dat   parsed with GCSVParser after setSeparator('\0')
// For csv and dat, a per-column parsing report is written to stderr.
//
// hOutput is reset to a matrix obtained from cloneSub() covering every row and
// column of the loaded data.
//
// Throws Ex if no filename is available or the input type is not one of the
// three above. The "unsupported" error names the type actually in effect, so
// an invalid -input_type value is reported rather than the file's extension.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Load the dataset by extension
	if(args.size() < 1)
		throw Ex("Expected the filename of a dataset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);

	// Determine the input type
	const char* input_type;
	if(args.next_is_flag() && args.if_pop("-input_type"))
		input_type = args.pop_string();
	else
	{
		// Deduce it from the extension (if any)
		input_type = szFilename + pd.extStart;
		if(*input_type != '.')
			input_type = "arff"; // no extension - assume ARFF
		else
			input_type++; // skip the '.'
	}

	// Now load the data
	GMatrix data;
	const bool isCsv = (_stricmp(input_type, "csv") == 0);
	const bool isDat = (_stricmp(input_type, "dat") == 0);
	if(_stricmp(input_type, "arff") == 0)
		data.loadArff(szFilename);
	else if(isCsv || isDat)
	{
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
		throw Ex("Unsupported file format: ", input_type);

	// Hand a full copy of the loaded data to the caller
	hOutput.reset(data.cloneSub(0, 0, data.rows(), data.cols()));
}
// Runs GManifoldSculpting on a dataset and prints the transformed data.
//
// Arguments, consumed in order: dataset filename, neighbor-finder
// specification (consumed by instantiateNeighborFinder), target dimension
// count, then any of:
//   -seed <uint>        reseed the random number generator
//   -continue <file>    hint data passed to setPreprocessedData; it must have
//                       target-dims attributes and as many rows as the dataset
//   -scalerate <double> value passed to setSquishingRate (default 0.999)
//
// Throws Ex on an unknown option or mismatched hint data.
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and params
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(initialSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	size_t targetDims = args.pop_uint();

	// Parse Options
	const char* szHintFile = NULL;
	double squishRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-continue"))
		{
			szHintFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-scalerate"))
		{
			squishRate = args.pop_double();
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data
	GMatrix* pHint = NULL;
	Holder<GMatrix> hHint(NULL);
	if(szHintFile)
	{
		pHint = loadData(szHintFile);
		hHint.reset(pHint);
		if(pHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pHint->rows() != pInput->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data
	GManifoldSculpting transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(squishRate);
	if(pHint)
		transform.setPreprocessedData(hHint.release()); // the holder gives up the hint matrix here
	transform.setNeighborFinder(pFinder);
	GMatrix* pResult = transform.doit(*pInput);
	Holder<GMatrix> hResult(pResult);
	pResult->print(cout);
}
// Computes a singular value decomposition of a dataset via
// GMatrix::singularValueDecomposition and writes the pieces out.
//
// Arguments: dataset filename, then any of:
//   -ufilename <file>      where U is saved (default "u.arff")
//   -sigmafilename <file>  where a sigma matrix is saved; if omitted the
//                          diagonal is printed to stdout instead
//   -vfilename <file>      where V is saved (default "v.arff")
//   -maxiters <uint>       iteration limit passed to the decomposition
//                          (default 100)
//
// The sigma matrix has pU->rows() rows and pV->rows() columns, is zero-filled,
// and carries the diagonal values on its main diagonal.
void singularValueDecomposition(GArgReader& args)
{
	// Load
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Parse options
	string ufilename = "u.arff";
	string sigmafilename;
	string vfilename = "v.arff";
	int maxIters = 100;
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
			ufilename = args.pop_string();
		else if(args.if_pop("-sigmafilename"))
			sigmafilename = args.pop_string();
		else if(args.if_pop("-vfilename"))
			vfilename = args.pop_string();
		else if(args.if_pop("-maxiters"))
			maxIters = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Decompose; the three outputs are owned by the holders below
	GMatrix* pU;
	double* pDiag;
	GMatrix* pV;
	pInput->singularValueDecomposition(&pU, &pDiag, &pV, false, maxIters);
	Holder<GMatrix> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GMatrix> hV(pV);

	// Save U and V
	pU->saveArff(ufilename.c_str());
	pV->saveArff(vfilename.c_str());

	// No sigma file requested: just print the diagonal
	if(sigmafilename.empty())
	{
		GVec::print(cout, 14, pDiag, std::min(pU->rows(), pV->rows()));
		cout << "\n";
		return;
	}

	// Build the sigma matrix and save it
	GMatrix sigma(pU->rows(), pV->rows());
	sigma.setAll(0.0);
	const size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
	for(size_t d = 0; d < diagLen; d++)
		sigma.row(d)[d] = pDiag[d];
	sigma.saveArff(sigmafilename.c_str());
}
// Prepends an "index" attribute to a dataset and prints the result.
//
// Arguments: dataset filename, then any of:
//   -start <double>      value for row 0 (default 0.0)
//   -increment <double>  step between consecutive rows (default 1.0)
//
// Row i of the new column holds start + i * increment. The one-column index
// matrix is joined in front of the data with GMatrix::mergeHoriz.
void AddIndexAttribute(GArgReader& args)
{
	// Parse args
	const char* filename = args.pop_string();
	double nStartValue = 0.0;
	double nIncrement = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-start"))
			nStartValue = args.pop_double();
		else if(args.if_pop("-increment"))
			nIncrement = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GMatrix* pInput = loadData(filename);
	Holder<GMatrix> hInput(pInput);

	// Build a one-attribute relation named "index"
	GArffRelation* pIndexRelation = new GArffRelation();
	pIndexRelation->addAttribute("index", 0, NULL);
	sp_relation pIndexRel = pIndexRelation;

	// Fill the index column
	GMatrix indexes(pIndexRel);
	const size_t rowCount = pInput->rows();
	indexes.newRows(rowCount);
	for(size_t r = 0; r < rowCount; r++)
		indexes.row(r)[0] = nStartValue + r * nIncrement;

	// Join and print
	GMatrix* pJoined = GMatrix::mergeHoriz(&indexes, pInput);
	Holder<GMatrix> hJoined(pJoined);
	pJoined->print(cout);
}
// Generates precision-recall data for a collaborative filter and prints it.
//
// Arguments, consumed in order:
//   leading flags: -seed <uint> (defaults to getpid() * time(NULL)), -ideal
//   dataset filename
//   algorithm specification, consumed by InstantiateAlgorithm
//
// Throws Ex on an unknown flag, a missing dataset, or leftover arguments.
void GRecommenderLib::precisionRecall(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	for(; args.next_is_flag(); )
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix ratings;
	loadData(ratings, args.pop_string());

	// Instantiate the recommender
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(seed);

	// Generate precision-recall data
	GMatrix* pCurve = pFilter->precisionRecall(ratings, ideal);
	std::unique_ptr<GMatrix> hCurve(pCurve);
	pCurve->deleteColumns(2, 1); // we don't need the false-positive rate column
	pCurve->print(cout);
}
// Prints two averages over a dataset's nearest neighbors, found with a
// GKdTree: the average of sqrt(distances[0]) after sorting, and the average of
// sqrt(distance) over all neighbor slots of all rows.
//
// Arguments, consumed in order: dataset filename, neighbor count.
void neighbors(GArgReader& args)
{
	// Load the data
	GMatrix* pPoints = loadData(args.pop_string());
	Holder<GMatrix> hPoints(pPoints);
	int neighborCount = args.pop_uint();

	// Find the neighbors
	GKdTree finder(pPoints, neighborCount, NULL, true);
	GTEMPBUF(size_t, hood, neighborCount);
	GTEMPBUF(double, dists, neighborCount);
	const size_t rowCount = pPoints->rows();
	double sumClosest = 0;
	double sumAll = 0;
	for(size_t row = 0; row < rowCount; row++)
	{
		finder.neighbors(hood, dists, row);
		finder.sortNeighbors(hood, dists);
		sumClosest += sqrt(dists[0]); // the original comments imply dists holds squared distances — NOTE(review): confirm
		for(int slot = 0; slot < neighborCount; slot++)
			sumAll += sqrt(dists[slot]);
	}

	// Report
	cout.precision(14);
	cout << "average closest neighbor distance = " << (sumClosest / rowCount) << "\n";
	cout << "average neighbor distance = " << (sumAll / (rowCount * neighborCount)) << "\n";
}
// Trains a GNominalToCat transform on a dataset, applies it to that same
// dataset with transformBatch, and prints the transformed data.
//
// Arguments: dataset filename, then optionally
//   -maxvalues <uint>   value passed to the GNominalToCat constructor
//                       (default 12)
void nominalToCat(GArgReader& args)
{
	// Load the file
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Parse Options
	int maxValues = 12;
	for(; args.size() > 0; )
	{
		if(args.if_pop("-maxvalues"))
			maxValues = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Transform the data
	GNominalToCat converter(maxValues);
	converter.train(*pInput);
	GMatrix* pConverted = converter.transformBatch(*pInput);
	Holder<GMatrix> hConverted(pConverted);

	// Print results
	pConverted->print(cout);
}
// Re-encodes one column of a dataset and prints the result.
//
// Arguments, consumed in order: dataset filename, column index.
//
// If the column's valueCount is greater than zero, its attribute value count
// is set to 0 on the relation (accessed as a GArffRelation). Otherwise every
// distinct value in the column is replaced by an integer id, assigned 0, 1,
// 2, ... in order of first appearance.
void enumerateValues(GArgReader& args)
{
	GMatrix* pTable = loadData(args.pop_string());
	Holder<GMatrix> hTable(pTable);
	size_t col = args.pop_uint();

	const bool hasValueCount = pTable->relation()->valueCount(col) > 0;
	if(!hasValueCount)
	{
		// Map each distinct value to the order in which it was first seen
		map<double,size_t> idOf;
		size_t nextId = 0;
		for(size_t r = 0; r < pTable->rows(); r++)
		{
			double& cell = pTable->row(r)[col];
			map<double,size_t>::iterator hit = idOf.find(cell);
			if(hit != idOf.end())
			{
				cell = (double)hit->second;
				continue;
			}
			idOf[cell] = nextId;
			cell = (double)nextId;
			nextId++;
		}
	}
	else
		static_cast<GArffRelation*>(pTable->relation().get())->setAttrValueCount(col, 0);

	pTable->print(cout);
}
// Removes every row that contains a missing value and prints what remains.
//
// A cell counts as missing when it equals UNKNOWN_REAL_VALUE in an attribute
// whose valueCount is 0, or UNKNOWN_DISCRETE_VALUE in any other attribute.
// Rows are visited from last to first, so deleting one never disturbs the
// indices still to be visited.
void DropMissingValues(GArgReader& args)
{
	GMatrix* pTable = loadData(args.pop_string());
	Holder<GMatrix> hTable(pTable);
	GRelation* pRel = pTable->relation().get();
	const size_t dims = pRel->size();

	for(size_t r = pTable->rows(); r > 0; )
	{
		--r;
		const double* pRow = pTable->row(r);

		// Scan the row until the first missing cell, if any
		bool hasMissing = false;
		for(size_t c = 0; c < dims && !hasMissing; c++)
		{
			if(pRel->valueCount(c) == 0)
				hasMissing = (pRow[c] == UNKNOWN_REAL_VALUE);
			else
				hasMissing = (pRow[c] == UNKNOWN_DISCRETE_VALUE);
		}

		if(hasMissing)
			pTable->deleteRow(r);
	}
	pTable->print(cout);
}
// Prints GMatrix::linearCorrelationCoefficient for two attributes of a
// dataset, with 14 digits of precision.
//
// Arguments, consumed in order: dataset filename, first attribute index,
// second attribute index, then optionally
//   -aboutorigin   pass 0 as both means instead of each attribute's
//                  GMatrix::mean
void correlation(GArgReader& args)
{
	GMatrix* pTable = loadData(args.pop_string());
	Holder<GMatrix> hTable(pTable);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Parse Options
	bool aboutorigin = false;
	for(; args.size() > 0; )
	{
		if(args.if_pop("-aboutorigin"))
			aboutorigin = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Pick the centers: the origin, or each attribute's mean
	double center1 = 0;
	double center2 = 0;
	if(!aboutorigin)
	{
		center1 = pTable->mean(attr1);
		center2 = pTable->mean(attr2);
	}

	const double coeff = pTable->linearCorrelationCoefficient(attr1, center1, attr2, center2);
	cout.precision(14);
	cout << coeff << "\n";
}
// Loads a dataset, calls GMatrix::cholesky on it, and prints the matrix to
// stdout. The single argument is the dataset filename.
void cholesky(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	pMatrix->cholesky();
	pMatrix->print(cout);
}
// Prints a per-column autocorrelation table for a dataset.
//
// Lags run from 1 to min(256, rows / 2). Each output row holds the lag followed
// by one value per input column: the mean, over all row pairs (k, k + lag), of
// the product of the two rows' deviations from that column's centroid value.
void autoCorrelation(GArgReader& args)
{
	GMatrix* pSeries = loadData(args.pop_string());
	Holder<GMatrix> hSeries(pSeries);

	const size_t maxLag = std::min((size_t)256, pSeries->rows() / 2);
	const size_t dims = pSeries->cols();
	GTEMPBUF(double, mean, dims);
	pSeries->centroid(mean);

	GMatrix ac(0, dims + 1);
	for(size_t lagStep = 1; lagStep <= maxLag; lagStep++)
	{
		double* pOut = ac.newRow();
		pOut[0] = (double)lagStep;
		for(size_t col = 0; col < dims; col++)
		{
			double acc = 0;
			size_t pairs = 0;
			while(pairs + lagStep < pSeries->rows())
			{
				const double* pEarly = pSeries->row(pairs);
				const double* pLate = pSeries->row(pairs + lagStep);
				acc += (pEarly[col] - mean[col]) * (pLate[col] - mean[col]);
				pairs++;
			}
			pOut[col + 1] = acc / pairs;
		}
	}
	ac.print(cout);
}
// Loads a dataset, calls GMatrix::toReducedRowEchelonForm on it, and prints
// the matrix to stdout. The single argument is the dataset filename.
void reducedRowEchelonForm(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	pMatrix->toReducedRowEchelonForm();
	pMatrix->print(cout);
}
// Splits a dataset into one ARFF file per value of a nominal attribute.
//
// Arguments, consumed in order: dataset filename, class attribute index, then
// optionally
//   -dropclass   delete the class column from each output file
//
// For each value index of the class attribute, GMatrix::splitByNominalValue
// fills a separate matrix, which is saved as "<name>_<value>.arff", where
// <name> is the input filename between pd.fileStart and pd.extStart and
// <value> is whatever printAttrValue emits for that value index.
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pTable = loadData(filename);
	Holder<GMatrix> hTable(pTable);
	size_t classAttr = args.pop_uint();

	bool dropClass = false;
	for(; args.size() > 0; )
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	for(size_t val = 0; val < pTable->relation()->valueCount(classAttr); val++)
	{
		// Pull the rows for this class value into their own matrix
		GMatrix subset(pTable->relation(), pTable->heap());
		pTable->splitByNominalValue(&subset, classAttr, val);

		// Compose the output filename
		PathData pd;
		GFile::parsePath(filename, &pd);
		const string stem(filename + pd.fileStart, pd.extStart - pd.fileStart);
		std::ostringstream nameBuilder;
		nameBuilder << stem << "_";
		pTable->relation()->printAttrValue(nameBuilder, classAttr, (double)val);
		nameBuilder << ".arff";
		const string outPath = nameBuilder.str();

		// Save
		if(dropClass)
			subset.deleteColumn(classAttr);
		subset.saveArff(outPath.c_str());
	}
}
// Cross-validates a collaborative filter on a dataset and prints RMSE, MSE and
// MAE to stdout.
//
// Arguments, consumed in order:
//   leading flags: -seed <uint> (defaults to getpid() * time(NULL)),
//                  -folds <uint> (default 2; must be at least 2)
//   dataset filename
//   algorithm specification, consumed by InstantiateAlgorithm
//
// Raises an error through ThrowError on an unknown flag, too few folds, a
// missing dataset, or leftover arguments.
void crossValidate(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t folds = 2;
	for(; args.next_is_flag(); )
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-folds"))
			folds = args.pop_uint();
		else
			ThrowError("Invalid crossvalidate option: ", args.peek());
	}
	if(folds < 2)
		ThrowError("There must be at least 2 folds.");

	// Load the data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Do cross-validation
	double mae;
	const double mse = pFilter->crossValidate(*pRatings, folds, &mae);
	cout << "RMSE=" << sqrt(mse) << ", MSE=" << mse << ", MAE=" << mae << "\n";
}
// Binarizes one continuous column of a dataset and prints the result.
//
// Arguments, consumed in order: dataset filename, column index, threshold
// value. Each cell in the column becomes 0 if it is <= the threshold and 1
// otherwise.
//
// Raises an error through ThrowError if the column index is out of range or
// the column's valueCount is non-zero.
void threshold(GArgReader& args)
{
	GMatrix* pTable = loadData(args.pop_string());
	Holder<GMatrix> hTable(pTable);
	unsigned column = args.pop_uint();

	// Validate the column index
	if(column >= hTable->cols())
	{
		std::stringstream msg;
		if(hTable->cols() < 1)
			msg << "This data has no columns to threshold.";
		else
			msg << "The column to threshold is too large. It should be in "
			    << "the range [0.." << (hTable->cols()-1) << "].";
		ThrowError(msg.str());
	}
	if(hTable->relation()->valueCount(column) != 0)
		ThrowError("Can only use threshold on continuous attributes.");

	double value = args.pop_double();

	// Do the actual thresholding
	for(size_t r = 0; r < hTable->rows(); ++r)
	{
		double& cell = hTable->row(r)[column];
		cell = (cell <= value) ? 0 : 1;
	}

	// Print the data
	hTable->print(cout);
}
// Replaces the missing values in every attribute of a dataset and prints the
// result.
//
// Arguments: dataset filename, then any of:
//   -seed <uint>   seed for the random number generator
//                  (defaults to getpid() * time(NULL))
//   -random        use replaceMissingValuesRandomly instead of
//                  replaceMissingValuesWithBaseline
void fillMissingValues(GArgReader& args)
{
	// Load
	GMatrix* pTable = loadData(args.pop_string());
	Holder<GMatrix> hTable(pTable);

	// Parse options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool random = false;
	for(; args.size() > 0; )
	{
		if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else if(args.if_pop("-random"))
			random = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Replace missing values and print
	GRand prng(nSeed);
	for(size_t attr = 0; attr < pTable->relation()->size(); attr++)
	{
		if(random)
			pTable->replaceMissingValuesRandomly(attr, &prng);
		else
			pTable->replaceMissingValuesWithBaseline(attr);
	}
	pTable->print(cout);
}
// Generates precision-recall data for a collaborative filter and prints it.
//
// Arguments, consumed in order:
//   leading flags: -seed <uint> (defaults to getpid() * time(NULL)), -ideal
//   dataset filename
//   algorithm specification, consumed by InstantiateAlgorithm
//
// Raises an error through ThrowError on an unknown flag, a missing dataset, or
// leftover arguments.
void precisionRecall(GArgReader& args)
{
	// Parse options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	for(; args.next_is_flag(); )
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Instantiate the recommender
	GRand prng(seed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Generate precision-recall data
	GMatrix* pCurve = pFilter->precisionRecall(*pRatings, ideal);
	Holder<GMatrix> hCurve(pCurve);
	pCurve->deleteColumn(2); // we don't need the false-positive rate column
	pCurve->print(cout);
}
// Adds noise drawn from GRand::normal(), scaled by a deviation, to a dataset
// and prints the result.
//
// Arguments, consumed in order: dataset filename, deviation (double), then any
// leading-flag options:
//   -seed <uint>         seed for the random number generator
//                        (defaults to getpid() * time(NULL))
//   -excludelast <uint>  leave this many trailing columns untouched
//                        (default 0)
//
// Raises an error through ThrowError on an unknown option, or if -excludelast
// exceeds the number of columns. Without that check the column count would
// underflow and the loop would write far past the end of each row.
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid addnoise option: ", args.peek()); // message previously named the wrong command
	}
	if(excludeLast > pData->cols())
		ThrowError("The -excludelast value exceeds the number of columns in the data.");

	// Perturb the first (cols - excludeLast) values of every row
	GRand prng(seed);
	const size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
// Runs GKMeans on a dataset and prints the matrix returned by reduce().
//
// Arguments, consumed in order: dataset filename, cluster count, then any of:
//   -seed <uint>   seed for the random number generator
//                  (defaults to getpid() * time(NULL))
//   -reps <uint>   value passed to GKMeans::setReps (default 1)
//
// Throws Ex on an unknown option.
void kmeans(GArgReader& args)
{
	// Load the file and params
	GMatrix points;
	loadData(points, args.pop_string());
	int clusters = args.pop_uint();

	// Parse Options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	size_t reps = 1;
	for(; args.size() > 0; )
	{
		if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else if(args.if_pop("-reps"))
			reps = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Do the clustering
	GRand prng(nSeed);
	GKMeans clusterer(clusters, &prng);
	clusterer.setReps(reps);
	GMatrix* pAssignments = clusterer.reduce(points);
	std::unique_ptr<GMatrix> hAssignments(pAssignments);
	pAssignments->print(cout);
}
// Runs GLLE on a dataset and prints the transformed data.
//
// Arguments, consumed in order: dataset filename, neighbor-finder
// specification (consumed by instantiateNeighborFinder), target dimension
// count, then optionally
//   -seed <uint>   reseed the random number generator
//
// Throws Ex on an unknown option.
void lle(GArgReader& args)
{
	// Load the file and params
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int initialSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(initialSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int targetDims = args.pop_uint();

	// Parse Options
	for(; args.size() > 0; )
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GLLE transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setNeighborFinder(pFinder);
	GMatrix* pResult = transform.doit(*pInput);
	Holder<GMatrix> hResult(pResult);
	pResult->print(cout);
}