/// Command handler: runs a singular value decomposition on a dataset and
/// writes out the resulting factors.
///
/// Arguments consumed from args, in order:
///   <dataset>               name handed to loadData()
///   -ufilename <file>       where the first factor is saved (default "u.arff")
///   -sigmafilename <file>   where the expanded diagonal matrix is saved; when
///                           this option is absent the diagonal values are
///                           printed to stdout instead
///   -vfilename <file>       where the third factor is saved (default "v.arff")
///   -maxiters <n>           forwarded to GMatrix::singularValueDecomposition
///                           (default 100)
/// Any other remaining argument is reported through ThrowError.
void singularValueDecomposition(GArgReader& args)
{
	// Load the input matrix and hand it to a Holder
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Option defaults
	string uFile = "u.arff";
	string sigmaFile;
	string vFile = "v.arff";
	int iterLimit = 100;

	// Consume every remaining argument
	while(args.size() > 0)
	{
		if(args.if_pop("-ufilename"))
		{
			uFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-sigmafilename"))
		{
			sigmaFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-vfilename"))
		{
			vFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-maxiters"))
		{
			iterLimit = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Decompose. The three results come back through out-parameters and are
	// wrapped in holders immediately after the call.
	// NOTE(review): the meaning of the 'false' argument is defined by
	// GMatrix::singularValueDecomposition and is not visible here.
	GMatrix* pLeft;
	double* pSingular;
	GMatrix* pRight;
	pData->singularValueDecomposition(&pLeft, &pSingular, &pRight, false, iterLimit);
	Holder<GMatrix> hLeft(pLeft);
	ArrayHolder<double> hSingular(pSingular);
	Holder<GMatrix> hRight(pRight);

	// The two matrix factors are always written to disk
	pLeft->saveArff(uFile.c_str());
	pRight->saveArff(vFile.c_str());

	if(sigmaFile.empty())
	{
		// No sigma file requested: print the diagonal values with 14 digits
		GVec::print(cout, 14, pSingular, std::min(pLeft->rows(), pRight->rows()));
		cout << "\n";
		return;
	}

	// Expand the diagonal into a zero-filled matrix with pLeft->rows() rows
	// and pRight->rows() columns, then save it.
	GMatrix sigma(pLeft->rows(), pRight->rows());
	sigma.setAll(0.0);
	const size_t diagLen = std::min(sigma.rows(), (size_t)sigma.cols());
	for(size_t k = 0; k < diagLen; k++)
		sigma.row(k)[k] = pSingular[k];
	sigma.saveArff(sigmaFile.c_str());
}
/// Command handler: splits a dataset into two ARFF files.
///
/// Arguments consumed from args, in order:
///   <dataset>      name handed to loadData()
///   <n>            unsigned row count; must not exceed the number of rows
///   <filename1>    file that receives the loaded matrix after the split
///   <filename2>    file that receives the matrix filled by splitBySize()
///   -shuffle       shuffle the rows before splitting
///   -seed <n>      seed for the random number generator (the default is
///                  derived from the process id and the current time)
/// Any other remaining argument is reported through ThrowError.
///
/// The row count passed to GMatrix::splitBySize is rows() - n.
/// NOTE(review): presumably that leaves n rows in <filename1> and moves the
/// remainder to <filename2>; confirm against GMatrix::splitBySize.
void split(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Validate the requested size in unsigned arithmetic *before* subtracting.
	// Doing the subtraction in mixed int/unsigned int and testing for a
	// negative result lets very large requests wrap around to a positive
	// value and slip past the range check.
	const size_t totalRows = pData->rows();
	const size_t requestedRows = args.pop_uint();
	if(requestedRows > totalRows)
		ThrowError("out of range. The data only has ", to_str(totalRows), " rows.");
	const size_t pats = totalRows - requestedRows;

	const char* szFilename1 = args.pop_string();
	const char* szFilename2 = args.pop_string();

	// Parse options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool shouldShuffle = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
			shouldShuffle = true;
		else if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle if necessary
	GRand rng(nSeed);
	if(shouldShuffle)
		pData->shuffle(rng);

	// Split and save both parts
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFilename1);
	other.saveArff(szFilename2);
}
/// Loads "data/air_passengers.arff" and fills the four output matrices.
///
/// The first 72 rows of the file go to the training matrices and the next 72
/// rows go to the test matrices. For overall row index i:
///   feature = i / 72 (so test features continue past the training range)
///   label   = log(10 * v) / log(10) - 2, where v is column 0 of row i
/// The label matrices are also written to "out/train.arff" and
/// "out/test.arff".
///
/// @param trainFeat  resized to 72 x 1 and filled with training features
/// @param trainLab   resized to 72 x 1 and filled with training labels
/// @param testFeat   resized to 72 x 1 and filled with test features
/// @param testLab    resized to 72 x 1 and filled with test labels
void Loader::loadAirPassengerData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/air_passengers.arff");

	const size_t labelDims = 1;
	const size_t firstRow = 0;
	const size_t trainCount = 72;
	const size_t testCount = 72;

	trainFeat.resize(trainCount, 1);
	trainLab.resize(trainCount, labelDims);
	testFeat.resize(testCount, 1);
	testLab.resize(testCount, labelDims);

	const double ln10 = log(10);
	const double vertOffset = 2;
	const double scale = 10; // an alternative value of 0.1 was left commented out

	const size_t totalCount = trainCount + testCount;
	for(size_t i = 0; i < totalCount; i++)
	{
		// Choose the destination pair and the row within it
		const bool inTrain = i < trainCount;
		const size_t r = inTrain ? i : i - trainCount;
		GMatrix& feats = inTrain ? trainFeat : testFeat;
		GMatrix& labs = inTrain ? trainLab : testLab;

		feats[r][0] = double(i) / trainCount;
		labs[r][0] = ((log(scale * raw[firstRow + i][0]) / ln10) - vertOffset);
	}

	trainLab.saveArff("out/train.arff");
	testLab.saveArff("out/test.arff");
}
/// Loads "data/mhsets_monthly-ozone.arff" and fills the four output matrices.
///
/// The first 108 rows of the file go to the training matrices and the next 44
/// rows go to the test matrices. For overall row index i:
///   feature = i / 108 (so test features continue past the training range)
///   label   = log(v) / log(10), where v is column 0 of row i
/// The label matrices are also written to "out/train.arff" and
/// "out/test.arff".
///
/// @param trainFeat  resized to 108 x 1 and filled with training features
/// @param trainLab   resized to 108 x 1 and filled with training labels
/// @param testFeat   resized to 44 x 1 and filled with test features
/// @param testLab    resized to 44 x 1 and filled with test labels
void Loader::loadOzoneData(GMatrix &trainFeat, GMatrix &trainLab, GMatrix &testFeat, GMatrix &testLab)
{
	GMatrix raw;
	raw.loadArff("data/mhsets_monthly-ozone.arff");

	const size_t labelDims = 1;
	const size_t firstRow = 0;
	const size_t trainCount = 108;
	const size_t testCount = 44;

	trainFeat.resize(trainCount, 1);
	trainLab.resize(trainCount, labelDims);
	testFeat.resize(testCount, 1);
	testLab.resize(testCount, labelDims);

	// Divisor that converts a natural log into a base-10 log
	const double ln10 = log(10);

	const size_t totalCount = trainCount + testCount;
	for(size_t i = 0; i < totalCount; i++)
	{
		// Choose the destination pair and the row within it
		const bool inTrain = i < trainCount;
		const size_t r = inTrain ? i : i - trainCount;
		GMatrix& feats = inTrain ? trainFeat : testFeat;
		GMatrix& labs = inTrain ? trainLab : testLab;

		feats[r][0] = double(i) / trainCount;
		labs[r][0] = log(raw[firstRow + i][0]) / ln10;
	}

	trainLab.saveArff("out/train.arff");
	testLab.saveArff("out/test.arff");
}
/// Command handler: trains a GAttributeSelector on a dataset, optionally saves
/// the reduced data, and prints the attribute ranking.
///
/// The dataset and its switches are consumed by loadDataWithSwitches(), which
/// also reports the number of label dimensions and fills originalIndices.
/// Flags consumed afterwards:
///   -seed <n>          seed for the random number generator (the default is
///                      derived from the process id and the current time)
///   -out <n> <file>    sets the target feature count to n (default 1) and
///                      saves the output of transformBatch() to <file>
/// Any other flag raises Ex.
///
/// Each printed ranking line holds originalIndices[rank] followed by the
/// attribute name at index 'rank' in the loaded data's relation.
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid attribute selector option: ", args.peek());
	}

	// Do the attribute selection. Training happens whether or not output was
	// requested.
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	as.train(data);

	if(outFilename.length() > 0)
	{
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		// Announce the file only after the save call has returned, so the
		// message is not printed for a save that threw.
		pDataOut->saveArff(outFilename.c_str());
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
	}

	// Print the ranking
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	// NOTE(review): this cast assumes the relation is a GArffRelation; confirm
	// that loadDataWithSwitches always produces one.
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
	{
		const size_t attr = as.ranks()[i];
		cout << originalIndices.at(attr) << " " << pRel->attrName(attr) << "\n";
	}
}