// Performs n-fold cross-validation of a collaborative filter on a ratings
// dataset, and prints RMSE, MSE, and MAE to stdout.
// Usage: [-seed n] [-folds n] <dataset> <algorithm>
void crossValidate(GArgReader& args)
{
	// Parse the command-line options
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	size_t foldCount = 2;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			randSeed = args.pop_uint();
		else if(args.if_pop("-folds"))
			foldCount = args.pop_uint();
		else
			ThrowError("Invalid crossvalidate option: ", args.peek());
	}
	if(foldCount < 2)
		ThrowError("There must be at least 2 folds.");

	// Load the ratings data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Instantiate the collaborative filter from the remaining arguments
	GRand rand(randSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(rand, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Cross-validate and report the error measures
	double mae;
	double mse = pFilter->crossValidate(*pRatings, foldCount, &mae);
	cout << "RMSE=" << sqrt(mse) << ", MSE=" << mse << ", MAE=" << mae << "\n";
}
// Computes precision/recall statistics for a collaborative filter on a
// ratings dataset and prints the resulting table to stdout.
// Usage: [-seed n] [-ideal] <dataset> <algorithm>
void precisionRecall(GArgReader& args)
{
	// Parse the command-line options
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			randSeed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the ratings data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Instantiate the collaborative filter from the remaining arguments
	GRand rand(randSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(rand, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Generate and print the precision-recall table
	GMatrix* pResults = pFilter->precisionRecall(*pRatings, ideal);
	Holder<GMatrix> hResults(pResults);
	pResults->deleteColumn(2); // we don't need the false-positive rate column
	pResults->print(cout);
}
// Computes precision/recall statistics for a collaborative filter on a
// ratings dataset and prints the resulting table to stdout.
// Usage: [-seed n] [-ideal] <dataset> <algorithm>
void GRecommenderLib::precisionRecall(GArgReader& args)
{
	// Parse the command-line options
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	bool ideal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			randSeed = args.pop_uint();
		else if(args.if_pop("-ideal"))
			ideal = true;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the ratings data
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix ratings;
	loadData(ratings, args.pop_string());

	// Instantiate the collaborative filter from the remaining arguments
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(randSeed);

	// Generate and print the precision-recall table
	GMatrix* pResults = pFilter->precisionRecall(ratings, ideal);
	std::unique_ptr<GMatrix> hResults(pResults);
	pResults->deleteColumns(2, 1); // we don't need the false-positive rate column
	pResults->print(cout);
}
// Measures transductive accuracy: trains a collaborative filter on one
// ratings set, tests it on another, and prints MSE and MAE to stdout.
// Usage: [-seed n] <trainset> <testset> <algorithm>
void GRecommenderLib::transacc(GArgReader& args)
{
	// Parse the command-line options
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			randSeed = args.pop_uint();
		else
			throw Ex("Invalid crossvalidate option: ", args.peek());
	}

	// Load the training and test sets
	if(args.size() < 1)
		throw Ex("No training set specified.");
	GMatrix train;
	loadData(train, args.pop_string());
	if(args.size() < 1)
		throw Ex("No test set specified.");
	GMatrix test;
	loadData(test, args.pop_string());

	// Instantiate the collaborative filter from the remaining arguments
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pFilter->rand().setSeed(randSeed);

	// Train on the training set, evaluate on the test set, and report error
	double mae;
	double mse = pFilter->trainAndTest(train, test, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
// Measures transductive accuracy: trains a collaborative filter on one
// ratings set, tests it on another, and prints MSE and MAE to stdout.
// Usage: [-seed n] <trainset> <testset> <algorithm>
void transacc(GArgReader& args)
{
	// Parse the command-line options
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			randSeed = args.pop_uint();
		else
			ThrowError("Invalid crossvalidate option: ", args.peek());
	}

	// Load the training and test sets
	if(args.size() < 1)
		ThrowError("No training set specified.");
	GMatrix* pTrain = loadData(args.pop_string());
	Holder<GMatrix> hTrain(pTrain);
	if(args.size() < 1)
		ThrowError("No test set specified.");
	GMatrix* pTest = loadData(args.pop_string());
	Holder<GMatrix> hTest(pTest);

	// Instantiate the collaborative filter from the remaining arguments
	GRand rand(randSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(rand, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Train on the training set, evaluate on the test set, and report error
	double mae;
	double mse = pFilter->trainAndTest(*pTrain, *pTest, &mae);
	cout << "MSE=" << mse << ", MAE=" << mae << "\n";
}
// Adds Gaussian noise (with the specified deviation) to the values of a
// dataset and prints the noisy data to stdout.
// Usage: <dataset> <dev> [-seed n] [-excludelast n]
// The -excludelast option leaves the last n columns untouched.
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	// fix: use size_t (was int) so the "cols() - excludeLast" subtraction
	// below is not a signed/unsigned mix, and validate the range so it
	// cannot wrap around when excludeLast exceeds the column count
	size_t excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid neighbor finder option: ", args.peek());
	}
	if(excludeLast > pData->cols())
		ThrowError("The -excludelast value is larger than the number of columns.");

	// Perturb every value except those in the excluded trailing columns
	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
// Loads a dataset from the next command-line argument and hands it back
// (as a heap-allocated GMatrix) through hOutput.
// The format is taken from an optional -input_type flag immediately after
// the filename, or else deduced from the filename extension; a filename
// with no extension is assumed to be ARFF.
// Supported formats: arff, csv (comma-separated), dat (null-separated).
// Throws Ex when no filename remains or the format is unsupported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Pop the filename
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	GMatrix data;
	// (fix: removed unused locals abortedCols/ambiguousCols)
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else { /* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}

	// Now load the data
	if(_stricmp(input_type, "arff") == 0)
	{
		data.loadArff(szFilename);
	}
	else if(_stricmp(input_type, "csv") == 0)
	{
		GCSVParser parser;
		parser.parse(data, szFilename);
		// Report how each column was interpreted
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else if(_stricmp(input_type, "dat") == 0)
	{
		GCSVParser parser;
		parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);
	}

	// Hand a heap-allocated copy of the loaded data back to the caller
	GMatrix* pFeatures = data.cloneSub(0, 0, data.rows(), data.cols());
	hOutput.reset(pFeatures);
}
// Replaces a random portion of the values in a dataset with "unknown"
// markers and prints the result to stdout.
// Usage: <dataset> <portion> [-seed n]
// <portion> is the fraction (0..1) of matrix entries to drop.
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData); // fix: the loaded matrix was previously leaked
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Use selection sampling to mark exactly k of the n entries unknown,
	// choosing the sentinel appropriate for each column's type.
	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			// Continuous attribute
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			// Nominal attribute
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
// Ranks the attributes of a dataset by salience, prints the ranking, and
// optionally saves a dimensionality-reduced copy of the data.
// Usage: <dataset + switches> [-seed n] [-out targetFeatures filename]
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid neighbor finder option: ", args.peek());
	}

	// Rank the attributes
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	as.train(data);

	// If requested, save the reduced dataset
	if(outFilename.length() > 0)
	{
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
		pDataOut->saveArff(outFilename.c_str());
	}

	// Report the ranking (most salient first), translated back to the
	// attribute indices of the original file
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
		cout << originalIndices.at(as.ranks()[i]) << " " << pRel->attrName(as.ranks()[i]) << "\n";
}
// Loads a trained GNeuralDecomposition model and extrapolates (forecasts)
// from it, printing the predictions to stdout.
// Usage: <model.json> [-start d] [-length d] [-step d] [-features file] [-outputFeatures]
void Extrapolate(GArgReader &args)
{
	// Load the model
	if(args.size() < 1)
	{
		throw Ex("Model not specified.");
	}
	GDom doc;
	doc.loadJson(args.pop_string());
	GLearnerLoader ll(true);
	GSupervisedLearner *pLearner = ll.loadLearner(doc.root());
	std::unique_ptr<GSupervisedLearner> hLearner(pLearner);

	// Parse options
	double start = 1.0;
	double length = 1.0;
	double step = 0.0002;
	bool useFeatures = false;
	bool outputFeatures = true;
	// NOTE(review): unchecked downcast — if the loaded model is not a
	// GNeuralDecomposition this is undefined behavior; consider a
	// dynamic_cast with an error message.
	GNeuralDecomposition *nd = (GNeuralDecomposition *) pLearner;
	std::unique_ptr<GMatrix> hFeatures;
	while(args.next_is_flag())
	{
		if(args.if_pop("-start"))
		{
			start = args.pop_double();
		}
		else if(args.if_pop("-length"))
		{
			length = args.pop_double();
		}
		else if(args.if_pop("-step"))
		{
			step = args.pop_double();
		}
		else if(args.if_pop("-features"))
		{
			// Extrapolate at explicitly supplied feature (time) values
			// instead of the generated start/length/step range
			LoadData(args, hFeatures);
			useFeatures = true;
		}
		else if(args.if_pop("-outputFeatures"))
		{
			// NOTE(review): outputFeatures already defaults to true, so this
			// flag is currently a no-op — confirm whether the default was
			// intended to be false.
			outputFeatures = true;
		}
		else
		{
			throw Ex("Invalid option: ", args.peek());
		}
	}

	// Extrapolate
	GMatrix *pOutput;
	if(useFeatures)
		pOutput = nd->extrapolate(*hFeatures.get());
	else
		pOutput = nd->extrapolate(start, length, step, outputFeatures);
	std::unique_ptr<GMatrix> hOutput(pOutput);

	// Output predictions
	pOutput->print(cout);
}
// Trains a GNeuralDecomposition model on a time series loaded from the
// command line and writes the serialized model (JSON) to stdout.
// The series file may have one column (values) or two (features, values);
// more than two columns is an error. If no features are supplied (via the
// second column or -features), row i gets the feature i/rows in [0, 1).
void Train(GArgReader &args)
{
	// Load series from file
	std::unique_ptr<GMatrix> hSeries, hFeatures;
	LoadData(args, hSeries);
	GMatrix *pSeries = hSeries.get();

	// Split features/labels when the file carries both
	if(pSeries->cols() == 2)
	{
		GMatrix *pFeatures = pSeries->cloneSub(0, 0, pSeries->rows(), 1);
		GMatrix *pLabels = pSeries->cloneSub(0, 1, pSeries->rows(), 1);
		hFeatures.reset(pFeatures);
		hSeries.reset(pLabels);
		pSeries = pLabels;
	}
	else if(pSeries->cols() > 2)
	{
		throw Ex("Too many columns!");
	}

	// Parse options
	// (fix: hold the model in a unique_ptr so it is not leaked — previously
	// it was never deleted, and an exception during option parsing or
	// training also leaked it)
	std::unique_ptr<GNeuralDecomposition> hND(new GNeuralDecomposition());
	GNeuralDecomposition *nd = hND.get();
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularization"))
			nd->setRegularization(args.pop_double());
		else if(args.if_pop("-learningRate"))
			nd->setLearningRate(args.pop_double());
		else if(args.if_pop("-linearUnits"))
			nd->setLinearUnits(args.pop_uint());
		else if(args.if_pop("-softplusUnits"))
			nd->setSoftplusUnits(args.pop_uint());
		else if(args.if_pop("-sigmoidUnits"))
			nd->setSigmoidUnits(args.pop_uint());
		else if(args.if_pop("-epochs"))
			nd->setEpochs(args.pop_uint());
		else if(args.if_pop("-features"))
			LoadData(args, hFeatures);
		else if(args.if_pop("-filterLogarithm"))
			nd->setFilterLogarithm(true);
		else
			throw Ex("Invalid option: ", args.peek());
	}

	if(hFeatures.get() == NULL)
	{
		// Generate default features: the normalized row index in [0, 1)
		GMatrix *pFeatures = new GMatrix(pSeries->rows(), 1);
		for(size_t i = 0; i < pSeries->rows(); i++)
		{
			pFeatures->row(i)[0] = i / (double) pSeries->rows();
		}
		hFeatures.reset(pFeatures);
	}

	// Train
	GMatrix *pFeatures = hFeatures.get();
	nd->train(*pFeatures, *pSeries);

	// Output the trained model
	GDom doc;
	doc.setRoot(nd->serialize(&doc));
	doc.writeJson(cout);
}
// Fills in the missing values of an ARFF dataset by training a
// collaborative filter on the known values, then prints the completed
// dataset to stdout.
// Usage: [-seed n] [-nonormalize] <data.arff> [-ignore attrs] <algorithm>
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	// Parse the options that precede the dataset
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse params
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes
	// (iterate from the back so earlier column indices remain valid; the
	// unsigned loop terminates when i wraps around past zero)
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
		dataOrig.deleteColumns(ignore[i], 1);

	// Keep a copy of the original relation so it can be restored at the end
	GRelation* pOrigRel = dataOrig.relation().clone();
	std::unique_ptr<GRelation> hOrigRel(pOrigRel);

	// Instantiate the collaborative filter from the remaining arguments
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		// Chain a normalizer in front of the nominal-to-categorical filter;
		// the chainer takes ownership of both transforms
		GIncrementalTransformChainer* pChainer = new GIncrementalTransformChainer(new GNormalize(), pNtc);
		hChainer.reset(pChainer);
		pFilter = pChainer;
	}
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	GMatrix* pData = pFilter->transformBatch(dataOrig);
	std::unique_ptr<GMatrix> hData(pData);

	// Convert to 3-column form (row index, column index, value), keeping
	// only the known values
	GMatrix* pMatrix = new GMatrix(0, 3);
	std::unique_ptr<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = pMatrix->newRow();
				vec[0] = (double)i;
				vec[1] = (double)j;
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	// NOTE(review): unique_ptr::release() relinquishes ownership without
	// deleting, so the 3-column matrix appears to be leaked here — confirm
	// whether reset() was intended.
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form and print it
	GMatrix* pOut = pFilter->untransformBatch(*pData);
	pOut->setRelation(hOrigRel.release());
	pOut->print(cout);
}
// Trains (or loads) a self-organizing map, transforms the dataset with it,
// optionally saves the trained network, and prints the transformed data
// to stdout.
// Usage: <dataset> <dim1> [dim2 ...] [options]
void selfOrganizingMap(GArgReader& args){
  // Load the file
  GMatrix* pData = loadData(args.pop_string());
  Holder<GMatrix> hData(pData);
  // Parse arguments: the leading unsigned ints give the map's dimensions;
  // the number of nodes is their product
  std::vector<double> netDims;
  unsigned numNodes = 1;
  while(args.next_is_uint()){
    unsigned dim = args.pop_uint();
    netDims.push_back(dim);
    numNodes *= dim;
  }
  if(netDims.size() < 1){
    throw Ex("No dimensions specified for self organizing map. ",
	     "A map must be at least 1 dimensional.");
  }
  // Default components (replaced below by command-line options where given)
  Holder<SOM::ReporterChain> reporters(new SOM::ReporterChain);
  // NOTE(review): 'alg' is declared but never used — 'algo' below is the
  // holder that actually receives the training algorithm.
  Holder<SOM::TrainingAlgorithm> alg(NULL);
  Holder<GDistanceMetric> weightDist(new GRowDistance);
  Holder<GDistanceMetric> nodeDist(new GRowDistance);
  Holder<SOM::NodeLocationInitialization> topology(new SOM::GridTopology);
  Holder<SOM::NodeWeightInitialization> weightInit
    (new SOM::NodeWeightInitializationTrainingSetSample(NULL));
  Holder<SOM::NeighborhoodWindowFunction> windowFunc
    (new SOM::GaussianWindowFunction());
  //Loading and saving
  string loadFrom = "";
  string saveTo = "";
  //Parameters for different training algorithms
  string algoName = "batch";
  double startWidth = -1;//Start width - set later if still negative
  double endWidth = -1;//End width - set later if still negative
  double startRate = -1;//Start learning rate
  double endRate = -1;//End learning rate
  unsigned numIter = 100;//Total iterations
  unsigned numConverge = 1;//#steps for batch to converge
  while(args.next_is_flag()){
    if(args.if_pop("-tofile")){
      saveTo = args.pop_string();
    }else if(args.if_pop("-fromfile")){
      loadFrom = args.pop_string();
    }else if(args.if_pop("-seed")){
      GRand::global().setSeed(args.pop_uint());
    }else if(args.if_pop("-neighborhood")){
      string name = args.pop_string();
      if(name == "gaussian"){
	windowFunc.reset(new SOM::GaussianWindowFunction());
      }else if(name == "uniform"){
	windowFunc.reset(new SOM::UniformWindowFunction());
      }else{
	throw Ex("Only gaussian and uniform are acceptible ",
		   "neighborhood types");
      }
    }else if(args.if_pop("-printMeshEvery")){
      // Periodically dump the mesh as an SVG while training
      using namespace SOM;
      unsigned interval = args.pop_uint();
      string baseFilename = args.pop_string();
      unsigned xDim = args.pop_uint();
      unsigned yDim = args.pop_uint();
      bool showTrain = false;
      if(args.if_pop("showTrain") || args.if_pop("showtrain")){
	showTrain = true;
      }
      smart_ptr<Reporter> weightReporter
	(new SVG2DWeightReporter(baseFilename, xDim, yDim, showTrain));
      Holder<IterationIntervalReporter> intervalReporter
	(new IterationIntervalReporter(weightReporter, interval));
      reporters->add(intervalReporter.release());
    }else if(args.if_pop("-batchTrain")){
      algoName = "batch";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      numIter = args.pop_uint();
      numConverge = args.pop_uint();
    }else if(args.if_pop("-stdTrain")){
      algoName = "standard";
      startWidth = args.pop_double();
      endWidth = args.pop_double();
      startRate = args.pop_double();
      endRate = args.pop_double();
      numIter = args.pop_uint();
    }else{
      throw Ex("Invalid option: ", args.peek());
    }
  }

  //Create the training algorithm
  // (the algorithm takes ownership of the released components)
  Holder<SOM::TrainingAlgorithm> algo;
  if(algoName == "batch"){
    // Default widths are derived from the largest map dimension
    double netRadius = *std::max_element(netDims.begin(), netDims.end());
    if(startWidth < 0){ startWidth = 2*netRadius; }
    if(endWidth < 0){ endWidth = 1; }
    algo.reset( new SOM::BatchTraining
      (startWidth, endWidth, numIter, numConverge,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else if(algoName == "standard"){
    algo.reset( new SOM::TraditionalTraining
      (startWidth, endWidth, startRate, endRate, numIter,
       weightInit.release(), windowFunc.release(),
       reporters.release()));
  }else{
    throw Ex("Unknown type of training algorithm: \"",
	       algoName, "\"");
  }

  //Create the network & transform the data
  Holder<GSelfOrganizingMap> som;
  Holder<GMatrix> out;

  if(loadFrom == ""){
    //Create map from arguments given
    som.reset(new GSelfOrganizingMap
      (netDims, numNodes, topology.release(), algo.release(),
       weightDist.release(), nodeDist.release()));
    //Train the network and transform the data in place
    out.reset(som->doit(*pData));
  }else{
    //Create map from file
    GDom source;
    source.loadJson(loadFrom.c_str());
    som.reset(new GSelfOrganizingMap(source.root()));
    //Transform using the loaded network
    out.reset(som->transformBatch(*pData));
  }

  //Save the trained network
  if(saveTo != ""){
    GDom serialized;
    GDomNode* root = som->serialize(&serialized);
    serialized.setRoot(root);
    serialized.saveJson(saveTo.c_str());
  }

  //Print the result
  out->print(cout);
}
// Projects a dataset onto its first nTargetDims principal components and
// prints the projected data to stdout. Can also save the eigenvalues, the
// component vectors, a round-tripped (reconstructed) copy of the data, and
// the trained PCA model, or load a previously-trained model.
// Usage: <dataset> <targetDims> [-seed n] [-roundtrip file]
//        [-eigenvalues file] [-components file] [-aboutorigin]
//        [-modelin file] [-modelout file]
void principalComponentAnalysis(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	string eigenvalues;
	string components;
	string modelIn;
	string modelOut;
	bool aboutOrigin = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-roundtrip"))
			roundTrip = args.pop_string();
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else if(args.if_pop("-components"))
			components = args.pop_string();
		else if(args.if_pop("-aboutorigin"))
			aboutOrigin = true;
		else if(args.if_pop("-modelin"))
			modelIn = args.pop_string();
		else if(args.if_pop("-modelout"))
			modelOut = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Obtain the transform: either deserialize a previously-trained model,
	// or train a new one on the data
	GRand prng(seed);
	GPCA* pTransform = NULL;
	if(modelIn.length() > 0)
	{
		GDom doc;
		doc.loadJson(modelIn.c_str());
		GLearnerLoader ll(prng);
		pTransform = new GPCA(doc.root(), ll);
	}
	else
	{
		pTransform = new GPCA(nTargetDims, &prng);
		if(aboutOrigin)
			pTransform->aboutOrigin();
		if(eigenvalues.length() > 0)
			pTransform->computeEigVals();
		pTransform->train(*pData);
	}
	Holder<GPCA> hTransform(pTransform);
	GMatrix* pDataAfter = pTransform->transformBatch(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	// NOTE(review): with -modelin, computeEigVals() is never called in this
	// run — confirm that a deserialized model carries valid eigenvalues
	// before combining -modelin with -eigenvalues.
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = pTransform->eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// Save the components
	if(components.length() > 0)
		pTransform->components()->saveArff(components.c_str());

	// Do the round-trip: project each row back to the original space so
	// the reconstruction error can be inspected
	if(roundTrip.size() > 0)
	{
		GMatrix roundTripped(pData->rows(), pData->cols());
		for(size_t i = 0; i < pData->rows(); i++)
			pTransform->untransform(pDataAfter->row(i), roundTripped.row(i));
		roundTripped.saveArff(roundTrip.c_str());
	}

	// Save the trained model
	if(modelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pTransform->serialize(&doc));
		doc.saveJson(modelOut.c_str());
	}
	pDataAfter->print(cout);
}
// Reduces a dataset to nTargetDims dimensions with neuro-PCA (a neural
// approximation of PCA) and prints the transformed data to stdout.
// Usage: <dataset> <targetDims> [-seed n] [-clampbias] [-linear]
//        [-eigenvalues file]
void neuroPCA(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	// NOTE(review): roundTrip is declared but never used in this function.
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool trainBias = true;
	bool linear = false;
	string eigenvalues = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-clampbias"))
			trainBias = false;
		else if(args.if_pop("-linear"))
			linear = true;
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Configure and apply the transform
	GRand prng(seed);
	GNeuroPCA transform(nTargetDims, &prng);
	if(!trainBias)
		transform.clampBias();
	if(linear)
		transform.setActivation(new GActivationIdentity());
	if(eigenvalues.length() > 0)
		transform.computeEigVals();
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = transform.eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// In linear mode, people usually expect normalized eigenvectors, so let's normalize them now
	// (each output column i is rescaled by the magnitude of weight row i+1;
	// row 0 holds the bias)
	if(linear)
	{
		GMatrix* pWeights = transform.weights();
		GAssert(pWeights->cols() == pData->cols());
		for(int i = 0; i < nTargetDims; i++)
		{
			double scal = sqrt(GVec::squaredMagnitude(pWeights->row(i + 1), pWeights->cols()));
			for(size_t j = 0; j < pDataAfter->rows(); j++)
				pDataAfter->row(j)[i] *= scal;
		}
	}
	pDataAfter->print(cout);
}
// Fills in the missing values of an ARFF dataset by training a
// collaborative filter on the known values, then prints the completed
// dataset to stdout.
// Usage: [-seed n] <data.arff> <algorithm>
void fillMissingValues(GArgReader& args)
{
	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	// Keep the original relation so it can be restored at the end
	sp_relation pOrigRel = pDataOrig->relation();
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	// NOTE(review): Holder::release() relinquishes ownership without
	// deleting, so the original matrix appears to be leaked here — confirm
	// whether an early delete (reset) was intended instead.
	hDataOrig.release();
	pDataOrig = NULL;

	// Convert to 3-column form (row index, column index, value), keeping
	// only the known values
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = i;
				pVec[1] = j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	// NOTE(review): as above, release() does not delete — the 3-column
	// matrix appears to be leaked.
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form and print it
	GMatrix* pOut = filter.untransformBatch(*pData);
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
///Load a dataset named on the command line into \a data, honoring the
///-labels and -ignore switches.
///
///(Note: despite what an older version of this comment said, nothing is
///returned — the function fills the caller-supplied \a data in place.)
///
///In the resulting matrix, all of the attributes designated as labels
///have been moved to the end and ignored attributes have been
///removed. The original indices of all the attributes are returned in
///originalIndices.
///
///\param data the matrix in which to place the loaded data
///
///\param args the command-line arguments
///
///\param pLabelDims (out parameter) the number of label dimensions
///(at least 1); the labels occupy the last pLabelDims columns of \a data.
///
///\param originalIndices the vector in which to place the original
///indices. originalIndices[i] is the index in the original data file
///of the attribute currently at index i.
void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims,
	std::vector<size_t>& originalIndices)
{
	// Load the dataset by extension
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
		data.loadArff(szFilename);
	else if(_stricmp(szFilename + pd.extStart, ".csv") == 0)
		data.loadCsv(szFilename, ',', false, false);
	else if(_stricmp(szFilename + pd.extStart, ".dat") == 0)
		data.loadCsv(szFilename, '\0', false, false);
	else
		throw Ex("Unsupported file format: ", szFilename + pd.extStart);

	//Make the initial list of original indices
	originalIndices.resize(data.cols());
	for(std::size_t i = 0; i < originalIndices.size(); ++i){
		originalIndices.at(i) = i;
	}

	// Parse params
	vector<size_t> ignore;
	vector<size_t> labels;
	while(args.next_is_flag())
	{
		if(args.if_pop("-labels"))
			parseAttributeList(labels, args, data.cols());
		else if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, data.cols());
		else
			break;
	}

	// Throw out the ignored attributes
	// (iterate from the back so earlier column indices remain valid; the
	// unsigned loop terminates when i wraps around past zero)
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
	{
		data.deleteColumn(ignore[i]);
		originalIndices.erase(originalIndices.begin()+ignore[i]);
		// Shift label indices left to account for the removed column
		for(size_t j = 0; j < labels.size(); j++)
		{
			if(labels[j] >= ignore[i])
			{
				if(labels[j] == ignore[i])
					throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label");
				labels[j]--;
			}
		}
	}

	// Swap label columns to the end
	pLabelDims = std::max((size_t)1, labels.size());
	for(size_t i = 0; i < labels.size(); i++)
	{
		size_t src = labels[i];
		size_t dst = data.cols() - pLabelDims + i;
		if(src != dst)
		{
			data.swapColumns(src, dst);
			std::swap(originalIndices.at(src), originalIndices.at(dst));
			// If a later-requested label currently lives at dst, it has just
			// been swapped to src; update its bookkeeping entry
			for(size_t j = i + 1; j < labels.size(); j++)
			{
				if(labels[j] == dst)
				{
					labels[j] = src;
					break;
				}
			}
		}
	}
}