// Splits a dataset into two ARFF files.
// Arguments, in order: dataset filename, an unsigned row count, first output
// filename, second output filename.
// Options: -shuffle (shuffle the rows first), -seed [n] (seed for the shuffle).
void split(GArgReader& args)
{
	// Load the dataset; the holder frees it on every exit path
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// The size passed to splitBySize is the row count minus the number given on the command line
	int splitSize = (int)pData->rows() - args.pop_uint();
	if(splitSize < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szOutFirst = args.pop_string();
	const char* szOutSecond = args.pop_string();

	// Remaining arguments are options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool wantShuffle = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
		{
			wantShuffle = true;
		}
		else if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// The generator is built unconditionally, but only used when shuffling was requested
	GRand rng(seed);
	if(wantShuffle)
		pData->shuffle(rng);

	// Split and write both parts
	GMatrix other(pData->relation());
	pData->splitBySize(&other, splitSize);
	pData->saveArff(szOutFirst);
	other.saveArff(szOutSecond);
}
// Runs the GNominalToCat transform over a dataset and prints the transformed data to stdout.
// Arguments: dataset filename.
// Options: -maxvalues [n] (default 12), passed to the GNominalToCat constructor.
void nominalToCat(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Options
	int valueLimit = 12;
	while(args.size() > 0)
	{
		if(args.if_pop("-maxvalues"))
		{
			valueLimit = args.pop_uint();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Train the transform on the data, then apply it to the same data
	GNominalToCat transform(valueLimit);
	transform.train(*pInput);
	GMatrix* pConverted = transform.transformBatch(*pInput);
	Holder<GMatrix> hConverted(pConverted);

	pConverted->print(cout);
}
// Reduces a dataset with GIsomap and prints the result to stdout.
// Arguments, in order: dataset filename, neighbor-finder arguments (consumed by
// instantiateNeighborFinder), target dimensionality.
// Options: -seed [n], -tolerant (drop disconnected points).
void isomap(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Time/pid-based default seed; -seed reseeds the same generator below
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int targetDims = args.pop_uint();

	// Options
	bool dropDisconnected = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
		}
		else if(args.if_pop("-tolerant"))
		{
			dropDisconnected = true;
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Configure and run the transform
	GIsomap transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setNeighborFinder(pFinder);
	if(dropDisconnected)
		transform.dropDisconnectedPoints();
	GMatrix* pReduced = transform.reduce(*pInput);
	Holder<GMatrix> hReduced(pReduced);
	pReduced->print(cout);
}
// Transforms a dataset with GLLE and prints the result to stdout.
// Arguments, in order: dataset filename, neighbor-finder arguments (consumed by
// instantiateNeighborFinder), target dimensionality.
// Options: -seed [n].
void lle(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Time/pid-based default seed; -seed reseeds the same generator below
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	int targetDims = args.pop_uint();

	// Options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Configure and run the transform
	GLLE transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setNeighborFinder(pFinder);
	GMatrix* pEmbedded = transform.doit(*pInput);
	Holder<GMatrix> hEmbedded(pEmbedded);
	pEmbedded->print(cout);
}
// Splits a dataset by the values of one nominal attribute, writing one ARFF
// file per value. Each output file is named "<base>_<value>.arff", where <base>
// is the input filename without its folder and extension.
// Arguments, in order: dataset filename, index of the class attribute.
// Options: -dropclass (delete the class column from each output file).
// Throws (via ThrowError) when the attribute index is out of range or an
// option is not recognized.
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	size_t classAttr = args.pop_uint();

	// Reject a bad index before it is used to address columns
	if(classAttr >= pData->relation()->size())
		ThrowError("Index out of range");

	bool dropClass = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// The base name of the outputs does not depend on the class value, so compute it once
	PathData pd;
	GFile::parsePath(filename, &pd);
	string baseName;
	baseName.assign(filename + pd.fileStart, pd.extStart - pd.fileStart);

	for(size_t i = 0; i < pData->relation()->valueCount(classAttr); i++)
	{
		// Move the rows with class value i into a temporary matrix
		GMatrix tmp(pData->relation(), pData->heap());
		pData->splitByNominalValue(&tmp, classAttr, i);

		// Build "<base>_<value>.arff"
		std::ostringstream oss;
		oss << baseName << "_";
		pData->relation()->printAttrValue(oss, classAttr, (double)i);
		oss << ".arff";
		string outName = oss.str();

		if(dropClass)
			tmp.deleteColumn(classAttr);
		tmp.saveArff(outName.c_str());
	}
}
// Reduces a dataset with GBreadthFirstUnfolding and prints the result to stdout.
// Arguments, in order: dataset filename, neighbor-finder arguments (consumed by
// instantiateNeighborFinder), target dimensionality.
// Options: -seed [n], -reps [n] (default 1).
void breadthFirstUnfolding(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	int targetDims = args.pop_uint();

	// Parse Options
	size_t reps = 1;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			nSeed = args.pop_uint();
			// The neighbor finder was given &prng, so reseed it too; otherwise -seed
			// would leave that generator on its time-based seed (the other
			// manifold commands in this file reseed prng the same way).
			prng.setSeed(nSeed);
		}
		else if(args.if_pop("-reps"))
			reps = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GBreadthFirstUnfolding transform(reps, pNF->neighborCount(), targetDims);
	transform.rand().setSeed(nSeed);
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.reduce(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
// Prints a single number comparing where the largest eigenvalue gap falls for
// a GNeuroPCA run with an identity activation versus one with a logistic
// activation, both on the normalized dataset.
// Arguments: dataset filename.
// Options: -seed [n], -maxeigs [n] (default 10).
void curviness2(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GNormalize norm;
	GMatrix* pDataNormalized = norm.doit(*pData);
	Holder<GMatrix> hDataNormalized(pDataNormalized);

	// Only the normalized copy is needed from here on
	hData.reset();
	pData = NULL;

	// Parse Options
	size_t maxEigs = 10;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-maxeigs"))
			maxEigs = args.pop_uint();
		else
			throw Ex("Invalid option: ", args.peek());
	}
	GRand rand(seed);
	size_t targetDims = std::min(maxEigs, pDataNormalized->cols());

	// Do linear PCA
	GNeuroPCA np1(targetDims, &rand);
	np1.setActivation(new GActivationIdentity());
	np1.computeEigVals();
	GMatrix* pResults1 = np1.doit(*pDataNormalized);
	Holder<GMatrix> hResults1(pResults1);
	double* pEigVals1 = np1.eigVals();
	// Replace each entry with the drop in sqrt(eigenvalue) to the next entry
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals1[i] = sqrt(pEigVals1[i]) - sqrt(pEigVals1[i + 1]);
	size_t max1 = GVec::indexOfMax(pEigVals1, targetDims - 1, &rand);
	double v1 = (double)max1;
	// Refine the peak position with a three-point (parabolic) interpolation when both neighbors exist
	if(max1 > 0 && max1 + 2 < targetDims)
		v1 += (pEigVals1[max1 - 1] - pEigVals1[max1 + 1]) / (2.0 * (pEigVals1[max1 - 1] + pEigVals1[max1 + 1] - 2.0 * pEigVals1[max1]));

	// Do non-linear PCA
	GNeuroPCA np2(targetDims, &rand);
	// Fixed: the logistic activation was previously set on np1 (already finished)
	// instead of np2, so it never applied to the non-linear model.
	np2.setActivation(new GActivationLogistic());
	np2.computeEigVals();
	GMatrix* pResults2 = np2.doit(*pDataNormalized);
	Holder<GMatrix> hResults2(pResults2);
	double* pEigVals2 = np2.eigVals();
	for(size_t i = 0; i + 1 < targetDims; i++)
		pEigVals2[i] = sqrt(pEigVals2[i]) - sqrt(pEigVals2[i + 1]);
	size_t max2 = GVec::indexOfMax(pEigVals2, targetDims - 1, &rand);
	double v2 = (double)max2;
	if(max2 > 0 && max2 + 2 < targetDims)
		v2 += (pEigVals2[max2 - 1] - pEigVals2[max2 + 1]) / (2.0 * (pEigVals2[max2 - 1] + pEigVals2[max2 + 1] - 2.0 * pEigVals2[max2]));

	// Compute the difference in where the eigenvalues fall
	cout.precision(14);
	cout << (v1 - v2) << "\n";
}
// Loads a dataset, calls centerMeanAtOrigin on it, and prints it to stdout.
// Arguments: dataset filename. Any further argument is an error.
void zeroMean(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// No options are accepted
	if(args.size() > 0)
	{
		ThrowError("Superfluous arg: ", args.pop_string());
	}

	pMatrix->centerMeanAtOrigin();
	pMatrix->print(cout);
}
// Loads a dataset, multiplies it by a scalar, and prints it to stdout.
// Arguments, in order: dataset filename, scalar. Any further argument is an error.
void multiplyScalar(GArgReader& args)
{
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);
	double factor = args.pop_double();

	// No options are accepted
	if(args.size() > 0)
	{
		ThrowError("Superfluous arg: ", args.pop_string());
	}

	pMatrix->multiply(factor);
	pMatrix->print(cout);
}
// Loads a dataset named by the next argument into hOutput.
// The format is taken from an optional "-input_type [type]" flag that follows
// the filename; otherwise it is deduced from the filename extension, and a
// filename without an extension is treated as ARFF. Supported types (compared
// case-insensitively): "arff", "csv", "dat". For "csv" and "dat" a per-column
// parsing report is written to stderr.
// Throws Ex when no filename is available or the format is not supported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Load the dataset by extension
	if(args.size() < 1)
		throw Ex("Expected the filename of a dataset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix data;
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}

	// Now load the data
	const bool isDat = (_stricmp(input_type, "dat") == 0);
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(isDat || _stricmp(input_type, "csv") == 0)
	{
		// "dat" goes through the same parser as "csv", with a '\0' separator
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		// Report the type that was actually rejected, which may have come from -input_type
		throw Ex("Unsupported file format: ", input_type);
	}

	// Hand the caller a heap-allocated copy of the whole matrix
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
// Computes precision/recall data for a collaborative filter, prints the area
// under the curve, and prints the results after dropping the precision column
// and swapping the remaining two columns.
// Leading options: -seed [n], -ideal.
// Arguments after the options: dataset filename, then the algorithm
// specification consumed by InstantiateAlgorithm.
void ROC(GArgReader& args)
{
	// Leading flags
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
		}
		else if(args.if_pop("-ideal"))
		{
			ideal = true;
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Dataset
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Recommender; nothing may follow its arguments
	GRand prng(seed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Generate the curve and summarize it
	GMatrix* pCurve = pFilter->precisionRecall(*pRatings, ideal);
	Holder<GMatrix> hCurve(pCurve);
	double auc = GCollaborativeFilter::areaUnderCurve(*pCurve);
	pCurve->deleteColumn(1); // the precision column is not needed
	pCurve->swapColumns(0, 1);
	cout << "% Area Under the Curve = " << auc << "\n";
	pCurve->print(cout);
}
// Transforms a dataset with GManifoldSculpting and prints the result to stdout.
// Arguments, in order: dataset filename, neighbor-finder arguments (consumed by
// instantiateNeighborFinder), target dimensionality.
// Options: -seed [n], -continue [filename] (preprocessed data to start from),
// -scalerate [r] (default 0.999).
void ManifoldSculpting(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pInput, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);
	size_t targetDims = args.pop_uint();

	// Options
	const char* szHintFile = NULL;
	double squishRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
		}
		else if(args.if_pop("-continue"))
		{
			szHintFile = args.pop_string();
		}
		else if(args.if_pop("-scalerate"))
		{
			squishRate = args.pop_double();
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Optional hint data; it must match the target dims and the input row count
	GMatrix* pHint = NULL;
	Holder<GMatrix> hHint(NULL);
	if(szHintFile)
	{
		pHint = loadData(szHintFile);
		hHint.reset(pHint);
		if(pHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pHint->rows() != pInput->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Configure and run the transform
	GManifoldSculpting transform(pFinder->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(squishRate);
	if(pHint)
	{
		// The holder gives up the hint matrix as it is passed to the transform
		transform.setPreprocessedData(hHint.release());
	}
	transform.setNeighborFinder(pFinder);
	GMatrix* pResult = transform.doit(*pInput);
	Holder<GMatrix> hResult(pResult);
	pResult->print(cout);
}
// Computes a singular value decomposition of a dataset and saves U and V as
// ARFF files. If a sigma filename is given, the diagonal values are saved as a
// matrix; otherwise they are printed to stdout.
// Arguments: dataset filename.
// Options: -ufilename [f] (default "u.arff"), -sigmafilename [f],
// -vfilename [f] (default "v.arff"), -maxiters [n] (default 100).
void singularValueDecomposition(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Options
	string uPath = "u.arff";
	string sigmaPath;
	string vPath = "v.arff";
	int iterLimit = 100;
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
		{
			uPath = args.pop_string();
		}
		else if(args.if_pop("-sigmafilename"))
		{
			sigmaPath = args.pop_string();
		}
		else if(args.if_pop("-vfilename"))
		{
			vPath = args.pop_string();
		}
		else if(args.if_pop("-maxiters"))
		{
			iterLimit = args.pop_uint();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Decompose; the three outputs are then owned by holders
	GMatrix* pU;
	double* pDiag;
	GMatrix* pV;
	pInput->singularValueDecomposition(&pU, &pDiag, &pV, false, iterLimit);
	Holder<GMatrix> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GMatrix> hV(pV);

	pU->saveArff(uPath.c_str());
	pV->saveArff(vPath.c_str());

	if(sigmaPath.length() > 0)
	{
		// Place the diagonal values on the diagonal of an otherwise zero matrix
		GMatrix sigma(pU->rows(), pV->rows());
		sigma.setAll(0.0);
		size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
		for(size_t k = 0; k < diagLen; k++)
			sigma.row(k)[k] = pDiag[k];
		sigma.saveArff(sigmaPath.c_str());
	}
	else
	{
		GVec::print(cout, 14, pDiag, std::min(pU->rows(), pV->rows()));
		cout << "\n";
	}
}
// Cross-validates a collaborative filter on a dataset and prints RMSE, MSE and MAE.
// Leading options: -seed [n], -folds [n] (default 2; must be at least 2).
// Arguments after the options: dataset filename, then the algorithm
// specification consumed by InstantiateAlgorithm.
void GRecommenderLib::crossValidate(GArgReader& args)
{
	// Leading flags
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t foldCount = 2;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
		}
		else if(args.if_pop("-folds"))
		{
			foldCount = args.pop_uint();
		}
		else
		{
			throw Ex("Invalid crossvalidate option: ", args.peek());
		}
	}
	if(foldCount < 2)
		throw Ex("There must be at least 2 folds.");

	// Dataset
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix ratings;
	loadData(ratings, args.pop_string());

	// Recommender; nothing may follow its arguments
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(seed);

	// crossValidate returns the MSE and writes the MAE through the pointer
	double mae;
	double mse = pFilter->crossValidate(ratings, foldCount, &mae);
	cout << "RMSE=" << sqrt(mse) << ", MSE=" << mse << ", MAE=" << mae << "\n";
}
// Merges any number of datasets horizontally and prints the result to stdout.
// Arguments: one or more dataset filenames. All datasets must have the same
// number of rows.
void mergeHoriz(GArgReader& args)
{
	GMatrix* pFirst = loadData(args.pop_string());
	Holder<GMatrix> hFirst(pFirst);

	// pAccum starts out aliasing the first dataset (owned by hFirst); after the
	// first merge it points at a matrix owned by hAccum.
	GMatrix* pAccum = pFirst;
	Holder<GMatrix> hAccum(NULL);
	while(args.size() > 0)
	{
		GMatrix* pNext = loadData(args.pop_string());
		Holder<GMatrix> hNext(pNext);
		if(pAccum->rows() != pNext->rows())
		{
			ThrowError("The datasets must have the same number of rows");
		}
		pAccum = GMatrix::mergeHoriz(pAccum, pNext);
		hAccum.reset(pAccum);
	}
	pAccum->print(cout);
}
// Blends two embeddings of the same data with GManifold::blendEmbeddings,
// using a ratio of 0.5 for every row, and prints the result to stdout.
// Arguments, in order: original dataset filename, neighbor-finder arguments
// (consumed by instantiateNeighborFinder), embedding A filename, embedding B
// filename.
// Options: -seed [n].
void blendEmbeddings(GArgReader& args)
{
	// Original data and neighbor finder
	GMatrix* pOriginal = loadData(args.pop_string());
	Holder<GMatrix> hOriginal(pOriginal);
	unsigned int defaultSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(defaultSeed);
	GNeighborFinder* pFinder = instantiateNeighborFinder(pOriginal, &prng, args);
	Holder<GNeighborFinder> hFinder(pFinder);

	// The two embeddings
	GMatrix* pEmbedA = loadData(args.pop_string());
	Holder<GMatrix> hEmbedA(pEmbedA);
	GMatrix* pEmbedB = loadData(args.pop_string());
	Holder<GMatrix> hEmbedB(pEmbedB);
	if(pEmbedA->rows() != pOriginal->rows() || pEmbedB->rows() != pOriginal->rows())
		throw Ex("mismatching number of rows");
	if(pEmbedA->cols() != pEmbedB->cols())
		throw Ex("mismatching number of cols");

	// Options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Get a neighbor table. A finder that is not cached is wrapped; the wrapper
	// receives the pointer released by the holder, and the holder then owns the wrapper.
	if(!pFinder->isCached())
	{
		GNeighborFinderCacheWrapper* pWrapper = new GNeighborFinderCacheWrapper(hFinder.release(), true);
		hFinder.reset(pWrapper);
		pFinder = pWrapper;
	}
	// NOTE(review): a finder that reports isCached() is assumed to be a GNeighborFinderCacheWrapper — confirm
	GNeighborFinderCacheWrapper* pCached = (GNeighborFinderCacheWrapper*)pFinder;
	pCached->fillCache();
	size_t* pNeighborTable = pCached->cache();

	// Do the blending
	size_t startPoint = (size_t)prng.next(pEmbedA->rows());
	double* pRatios = new double[pEmbedA->rows()];
	ArrayHolder<double> hRatios(pRatios);
	GVec::setAll(pRatios, 0.5, pEmbedA->rows());
	GMatrix* pBlended = GManifold::blendEmbeddings(pEmbedA, pRatios, pEmbedB, pFinder->neighborCount(), pNeighborTable, startPoint);
	Holder<GMatrix> hBlended(pBlended);
	pBlended->print(cout);
}
// Discretizes the continuous attributes in a column range into equal-width
// buckets and prints the resulting dataset to stdout. Attributes that already
// report a non-zero value count are left untouched.
// Arguments: dataset filename.
// Options: -buckets [n] (default max(2, floor(sqrt(rows + 0.5)))),
// -colrange [first] [last] (inclusive; default is all columns).
// Throws (via ThrowError) on an unknown option, a bucket count of zero, or a
// column range that is out of bounds.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1; // wraps for an empty relation; rejected by the range check below
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(nBuckets < 1)
		ThrowError("There must be at least one bucket");
	// nFirst is unsigned, so only the upper bound and the ordering need checking
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	const double maxBucket = (double)(nBuckets - 1);
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			// Clamp as a double before storing: converting NaN (range == 0) or a
			// negative value to size_t is undefined behavior.
			// NOTE(review): values below min (e.g. an unknown-value marker) land in bucket 0 — confirm that is wanted
			double bucket = floor(((pPat[i] - min) * nBuckets) / range);
			if(!(bucket >= 0.0)) // also true for NaN
				bucket = 0.0;
			else if(bucket > maxBucket)
				bucket = maxBucket;
			pPat[i] = bucket;
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
// Runs GManifold::multiDimensionalScaling on a distance matrix and prints the
// result to stdout. The random generator is always seeded with 0.
// Arguments, in order: distance-matrix filename, target dimensionality.
// Options: -squareddistances.
void multiDimensionalScaling(GArgReader& args)
{
	GRand prng(0);
	GMatrix* pDistances = loadData(args.pop_string());
	// Own the loaded matrix so it is freed on return and on every throw below.
	// The pointer is passed on without release(), so ownership stays here.
	Holder<GMatrix> hDistances(pDistances);
	int targetDims = args.pop_uint();

	// Parse Options
	bool useSquaredDistances = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-squareddistances"))
			useSquaredDistances = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	GMatrix* pResults = GManifold::multiDimensionalScaling(pDistances, targetDims, &prng, useSquaredDistances);
	Holder<GMatrix> hResults(pResults);
	pResults->print(cout);
}
// Shuffles the rows of a dataset and prints it to stdout.
// Arguments: dataset filename.
// Options: -seed [n] (default is derived from the pid and the current time).
void Shuffle(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// The generator is built only after the final seed is known
	GRand prng(seed);
	pInput->shuffle(prng);
	pInput->print(cout);
}
// Parses a delimited text file with GMatrix::parseCsv, names the resulting
// relation after the input filename, and prints the dataset to stdout.
// Arguments: filename.
// Options: -tab, -space, -whitespace, -semicolon, -separator [c] (the first
// character of the next argument), -tolerant, -columnnames.
void Import(GArgReader& args)
{
	// Read the whole file into memory
	size_t fileLen;
	const char* szPath = args.pop_string();
	char* pContents = GFile::loadFile(szPath, &fileLen);
	ArrayHolder<char> hContents(pContents);

	// Options
	char sep = ',';
	bool lenient = false;
	bool headerRow = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
		{
			sep = '\t';
		}
		else if(args.if_pop("-space"))
		{
			sep = ' ';
		}
		else if(args.if_pop("-whitespace"))
		{
			sep = '\0';
		}
		else if(args.if_pop("-semicolon"))
		{
			sep = ';';
		}
		else if(args.if_pop("-separator"))
		{
			sep = args.pop_string()[0];
		}
		else if(args.if_pop("-tolerant"))
		{
			lenient = true;
		}
		else if(args.if_pop("-columnnames"))
		{
			headerRow = true;
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Parse the file
	GMatrix* pParsed = GMatrix::parseCsv(pContents, fileLen, sep, headerRow, lenient);
	Holder<GMatrix> hParsed(pParsed);
	((GArffRelation*)pParsed->relation().get())->setName(szPath);

	// Print the data
	pParsed->print(cout);
}
// Builds a transition dataset from an action dataset and a state dataset and
// prints it to stdout. Output row i holds the action of row i, the state of
// row i, and the state of row i+1 (or, with -delta, the state of row i+1 minus
// the state of row i). The output therefore has one row fewer than the inputs,
// and no rows when the inputs are empty.
// Arguments, in order: actions filename, state filename.
// Options: -delta.
// Throws (via ThrowError) when the two datasets have different row counts or
// an option is not recognized.
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output relation: the action attributes followed by two copies of the state dims
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);

	// Guard the subtraction: with zero input rows, rows() - 1 would wrap around
	const size_t rowCount = pActions->rows();
	const size_t transitionCount = rowCount > 0 ? rowCount - 1 : 0;

	// Own the output so it is released when the function exits
	GMatrix* pTransition = new GMatrix(pRel);
	Holder<GMatrix> hTransition(pTransition);
	pTransition->newRows(transitionCount);
	for(size_t i = 0; i < transitionCount; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
// Prints every row of a dataset to stdout with the values separated by a
// chosen separator (a comma by default).
// Arguments: dataset filename.
// Options: -tab (tab-separated), -space (space-separated).
void Export(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	const char* separator = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = "\t"; // spelled as an escape; this previously held a plain space, making -tab identical to -space
		else if(args.if_pop("-space"))
			separator = " ";
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print
	for(size_t i = 0; i < pData->rows(); i++)
		pData->relation()->printRow(cout, pData->row(i), separator);
}
// Writes the training and test portions of one cross-validation fold to two
// ARFF files. The test portion is the contiguous row range
// [rows*fold/folds, rows*(fold+1)/folds); all other rows go to training.
// Arguments, in order: dataset filename, fold index, number of folds.
// Options: -out [train filename] [test filename]
// (defaults "train.arff" and "test.arff").
void splitFold(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);
	size_t fold = args.pop_uint();
	size_t folds = args.pop_uint();
	if(fold >= folds)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Options
	string trainPath = "train.arff";
	string testPath = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			trainPath = args.pop_string();
			testPath = args.pop_string();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Boundaries of the test range
	const size_t rowCount = pInput->rows();
	size_t testBegin = rowCount * fold / folds;
	size_t testEnd = rowCount * (fold + 1) / folds;

	// Rows before and after the test range are copied to train, in their original order
	GMatrix train(pInput->relation());
	GMatrix test(pInput->relation());
	size_t r = 0;
	for( ; r < testBegin; r++)
		train.copyRow(pInput->row(r));
	for(r = testBegin; r < testEnd; r++)
		test.copyRow(pInput->row(r));
	for(r = testEnd; r < pInput->rows(); r++)
		train.copyRow(pInput->row(r));

	train.saveArff(trainPath.c_str());
	test.saveArff(testPath.c_str());
}
// Applies a GNormalize transform to a dataset and prints the result to stdout.
// Arguments: dataset filename.
// Options: -range [min] [max] (defaults 0.0 and 1.0), passed to the
// GNormalize constructor.
void normalize(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// Options
	double lo = 0.0;
	double hi = 1.0;
	while(args.size() > 0)
	{
		if(args.if_pop("-range"))
		{
			lo = args.pop_double();
			hi = args.pop_double();
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	// Train the transform on the data, then apply it to the same data
	GNormalize transform(lo, hi);
	transform.train(*pInput);
	GMatrix* pScaled = transform.transformBatch(*pInput);
	Holder<GMatrix> hScaled(pScaled);
	pScaled->print(cout);
}
// Multiplies two matrices with GMatrix::multiply and prints the product to stdout.
// Arguments, in order: filename of matrix A, filename of matrix B.
// Options: -transposea, -transposeb (passed to GMatrix::multiply).
void multiplyMatrices(GArgReader& args)
{
	GMatrix* pLeft = loadData(args.pop_string());
	Holder<GMatrix> hLeft(pLeft);
	GMatrix* pRight = loadData(args.pop_string());
	Holder<GMatrix> hRight(pRight);

	// Options
	bool transposeLeft = false;
	bool transposeRight = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-transposea"))
		{
			transposeLeft = true;
		}
		else if(args.if_pop("-transposeb"))
		{
			transposeRight = true;
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	GMatrix* pProduct = GMatrix::multiply(*pLeft, *pRight, transposeLeft, transposeRight);
	Holder<GMatrix> hProduct(pProduct);
	pProduct->print(cout);
}
// Sorts the rows of a dataset by one attribute and prints it to stdout.
// Arguments, in order: dataset filename, attribute index.
// Options: -descending (reverse the rows after sorting).
void SortByAttribute(GArgReader& args)
{
	GMatrix* pInput = loadData(args.pop_string());
	Holder<GMatrix> hInput(pInput);

	// The attribute index must address an existing column
	size_t sortAttr = args.pop_uint();
	size_t columnCount = pInput->relation()->size();
	if(sortAttr >= columnCount)
		ThrowError("Index out of range");

	// Options
	bool reverseOrder = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-descending"))
		{
			reverseOrder = true;
		}
		else
		{
			ThrowError("Invalid option: ", args.peek());
		}
	}

	pInput->sort(sortAttr);
	if(reverseOrder)
		pInput->reverseRows();
	pInput->print(cout);
}
// Loads a trained model from a JSON file, asks it to extrapolate, and prints
// the predictions to stdout.
// Arguments: model filename.
// Options: -start [x] (default 1.0), -length [x] (default 1.0),
// -step [x] (default 0.0002), -features [dataset] (extrapolate over the given
// features instead of a generated range), -outputFeatures.
void Extrapolate(GArgReader &args)
{
	// Load the model
	if(args.size() < 1)
		throw Ex("Model not specified.");
	GDom doc;
	doc.loadJson(args.pop_string());
	GLearnerLoader loader(true);
	GSupervisedLearner *pLearner = loader.loadLearner(doc.root());
	std::unique_ptr<GSupervisedLearner> hLearner(pLearner);

	// NOTE(review): the loaded learner is assumed to be a GNeuralDecomposition; the cast is unchecked — confirm
	GNeuralDecomposition *pDecomp = (GNeuralDecomposition *) pLearner;

	// Option defaults
	double start = 1.0;
	double length = 1.0;
	double step = 0.0002;
	bool haveFeatures = false;
	// NOTE(review): this already defaults to true, so -outputFeatures changes nothing — confirm the intended default
	bool outputFeatures = true;
	std::unique_ptr<GMatrix> hFeatures;

	// Parse options
	while(args.next_is_flag())
	{
		if(args.if_pop("-start"))
			start = args.pop_double();
		else if(args.if_pop("-length"))
			length = args.pop_double();
		else if(args.if_pop("-step"))
			step = args.pop_double();
		else if(args.if_pop("-features"))
		{
			LoadData(args, hFeatures);
			haveFeatures = true;
		}
		else if(args.if_pop("-outputFeatures"))
			outputFeatures = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Extrapolate over the supplied features, or over the generated range
	GMatrix *pPredictions;
	if(haveFeatures)
	{
		pPredictions = pDecomp->extrapolate(*hFeatures.get());
	}
	else
	{
		pPredictions = pDecomp->extrapolate(start, length, step, outputFeatures);
	}
	std::unique_ptr<GMatrix> hPredictions(pPredictions);

	// Output predictions
	pPredictions->print(cout);
}
// Prints basic statistics and several paired significance tests comparing two
// attributes of a dataset: medians, means, standard deviations, a
// less/same/greater tally, a paired T-test (raw and normalized), and a
// Wilcoxon signed ranks test.
// Arguments, in order: dataset filename, first attribute index, second
// attribute index.
// Options: -tol [x] (default 0.001), the tolerance under which two values
// count as the same.
// Throws (via ThrowError) when an attribute index is out of range or an
// option is not recognized.
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Reject indices that do not address a column before they are used to index rows
	const size_t colCount = pData->cols();
	if((size_t)attr1 >= colCount || (size_t)attr2 >= colCount)
		ThrowError("Index out of range");

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";

		// Tally, per row, whether attr1 is less than, equal to (within tolerance), or greater than attr2
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		// The header now ends with a newline so the next line does not run into it
		cout << "\n### Wilcoxon Signed Ranks Test\n";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";

		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less than 0.05 is considered to be significant.\n";
/*
		double p_sum = GMath::wilcoxonPValue(num, w_sum);
		cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
*/
	}
}
// Fills the missing values of an ARFF dataset with predictions from a
// collaborative filter and prints the completed dataset to stdout.
// The data is first passed through a GNormalize/GNominalToCat filter chain,
// converted to 3-column (row, column, value) form for training, filled in, and
// then converted back and given its original relation.
// Leading options: -seed [n].
// Arguments after the options: ARFF filename, then the algorithm
// specification consumed by InstantiateAlgorithm.
void fillMissingValues(GArgReader& args)
{
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	sp_relation pOrigRel = pDataOrig->relation();
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	// The original matrix stays owned by hDataOrig and is freed on return.
	// (This used to call release(), which drops ownership without deleting, so it leaked.)

	// Convert to 3-column form
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = (double)i;
				pVec[1] = (double)j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter. The 3-column matrix stays owned by hMatrix
	// (previously leaked through release()); it is kept alive until return in
	// case the model refers to it.
	pModel->train(*pMatrix);

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form; the result is now owned and freed on return
	GMatrix* pOut = filter.untransformBatch(*pData);
	Holder<GMatrix> hOut(pOut);
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
// Measures the mean squared error between two datasets of identical shape and
// prints it to stdout.
// Arguments, in order: first dataset filename, second dataset filename.
// Options: -fit (run a hill-climbing search with FitDataCritic before
// reporting, logging the error to stderr as it goes), -sum (report one value
// summed over the attributes instead of one value per attribute).
// Throws (via ThrowError) when the datasets differ in column or row count or
// an option is not recognized.
void MeasureMeanSquaredError(GArgReader& args)
{
	// Load the first file
	GMatrix* pData1 = loadData(args.pop_string());
	Holder<GMatrix> hData1(pData1);

	// Load the second file
	GMatrix* pData2 = loadData(args.pop_string());
	Holder<GMatrix> hData2(pData2);

	// check sizes
	if(pData1->relation()->size() != pData2->relation()->size())
		ThrowError("The datasets must have the same number of dims");
	if(pData1->rows() != pData2->rows())
		ThrowError("The datasets must have the same size");

	// Parse Options
	bool fit = false;
	bool sumOverAttributes = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-fit"))
			fit = true;
		else if(args.if_pop("-sum"))
			sumOverAttributes = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	size_t dims = pData1->relation()->size();
	if(fit)
	{
		FitDataCritic critic(pData1, pData2, dims);
		GHillClimber search(&critic);

		double dPrevError;
		double dError = search.iterate();
		cerr.precision(14);
		cerr << dError << "\n";
		cerr.flush();
		while(true)
		{
			// Run 30 iterations per convergence check
			dPrevError = dError;
			for(int i = 1; i < 30; i++)
				search.iterate();
			dError = search.iterate();
			cerr << dError << "\n";
			cerr.flush();
			// Stop once the relative improvement drops below 1e-10. Written as a negated >=
			// so that a NaN ratio (0/0, when the error has already reached exactly zero) also
			// stops; with a plain "< 1e-10" test the NaN compared false and the loop never ended.
			if(!((dPrevError - dError) / dPrevError >= 1e-10))
				break;
		}
		critic.ShowResults(search.currentVector(), sumOverAttributes);
	}
	else
	{
		// Compute mean squared error
		GTEMPBUF(double, results, dims);
		ComputeMeanSquaredError(pData1, pData2, dims, results);
		cout.precision(14);
		if(sumOverAttributes)
			cout << GVec::sumElements(results, dims);
		else
			GVec::print(cout, 14, results, dims);
	}
	cout << "\n";
}