// virtual
void GGaussianProcess::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GGaussianProcess only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");
	if(features.rows() <= m_maxSamples)
	{
		trainInnerInner(features, labels);
		return;
	}
	GMatrix f(features.relation().clone());
	GReleaseDataHolder hF(&f);
	GMatrix l(labels.relation().clone());
	GReleaseDataHolder hL(&l);
	for(size_t i = 0; i < features.rows(); i++)
	{
		f.takeRow((GVec*)&features[i]);
		l.takeRow((GVec*)&labels[i]);
	}
	while(f.rows() > m_maxSamples)
	{
		size_t i = (size_t)m_rand.next(f.rows());
		f.releaseRow(i);
		l.releaseRow(i);
	}
	trainInnerInner(f, l);
}
void enumerateValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();
	if(pData->relation()->valueCount(col) > 0)
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
	else
	{
		size_t n = 0;
		map<double,size_t> themap;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			map<double,size_t>::iterator it = themap.find(pRow[col]);
			if(it == themap.end())
			{
				themap[pRow[col]] = n;
				pRow[col] = (double)n;
				n++;
			}
			else
				pRow[col] = (double)it->second;
		}
	}
	pData->print(cout);
}
// virtual
void GLinearRegressor::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearRegressor only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon
	clear();
	GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels);
	Holder<GMatrix> hAll(pAll);
	GPCA pca(features.cols());
	pca.train(*pAll);
	size_t inputs = features.cols();
	size_t outputs = labels.cols();
	GMatrix f(inputs, inputs);
	GMatrix l(inputs, outputs);
	for(size_t i = 0; i < inputs; i++)
	{
		GVec::copy(f[i].data(), pca.basis()->row(i).data(), inputs);
		double sqmag = f[i].squaredMagnitude();
		if(sqmag > 1e-10)
			f[i] *= 1.0 / sqmag;
		l[i].set(pca.basis()->row(i).data() + inputs, outputs);
	}
	m_pBeta = GMatrix::multiply(l, f, true, false);
	m_epsilon.resize(outputs);
	GVecWrapper vw(pca.centroid().data(), m_pBeta->cols());
	m_pBeta->multiply(vw.vec(), m_epsilon, false);
	m_epsilon *= -1.0;
	GVec::add(m_epsilon.data(), pca.centroid().data() + inputs, outputs);

	// Refine the results using gradient descent
	refine(features, labels, 0.06, 20, 0.75);
}
void splitClass(GArgReader& args)
{
	const char* filename = args.pop_string();
	GMatrix* pData = loadData(filename);
	Holder<GMatrix> hData(pData);
	size_t classAttr = args.pop_uint();

	bool dropClass = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-dropclass"))
			dropClass = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	for(size_t i = 0; i < pData->relation()->valueCount(classAttr); i++)
	{
		GMatrix tmp(pData->relation(), pData->heap());
		pData->splitByNominalValue(&tmp, classAttr, i);
		std::ostringstream oss;
		PathData pd;
		GFile::parsePath(filename, &pd);
		string fn;
		fn.assign(filename + pd.fileStart, pd.extStart - pd.fileStart);
		oss << fn << "_";
		pData->relation()->printAttrValue(oss, classAttr, (double)i);
		oss << ".arff";
		string s = oss.str();
		if(dropClass)
			tmp.deleteColumn(classAttr);
		tmp.saveArff(s.c_str());
	}
}
void fillMissingValues(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool random = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else if(args.if_pop("-random"))
			random = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Replace missing values and print
	GRand prng(nSeed);
	if(random)
	{
		for(size_t i = 0; i < pData->relation()->size(); i++)
			pData->replaceMissingValuesRandomly(i, &prng);
	}
	else
	{
		for(size_t i = 0; i < pData->relation()->size(); i++)
			pData->replaceMissingValuesWithBaseline(i);
	}
	pData->print(cout);
}
void test_transform_mergevert()
{
	// Make some input files
	TempFileMaker tempFile1("a.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { alice, bob }\n"
		"@ATTRIBUTE a3 { true, false }\n"
		"@DATA\n"
		"1.2, alice, true\n"
		"2.3, bob, false\n"
		);
	TempFileMaker tempFile2("b.arff",
		"@RELATION test\n"
		"@ATTRIBUTE a1 continuous\n"
		"@ATTRIBUTE a2 { charlie, bob }\n"
		"@ATTRIBUTE a3 { false, true }\n"
		"@DATA\n"
		"3.4, bob, true\n"
		"4.5, charlie, false\n"
		);

	// Execute the command
	GPipe pipeStdOut;
	if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0)
		throw Ex("exit status indicates failure");
	char buf[512];
	size_t len = pipeStdOut.read(buf, 512);
	if(len == 512)
		throw Ex("need a bigger buffer");
	buf[len] = '\0';

	// Check the results
	GMatrix M;
	M.parseArff(buf, strlen(buf));
	if(M.rows() != 4 || M.cols() != 3)
		throw Ex("failed");
	if(M.relation().valueCount(0) != 0)
		throw Ex("failed");
	if(M.relation().valueCount(1) != 3)
		throw Ex("failed");
	if(M.relation().valueCount(2) != 2)
		throw Ex("failed");
	std::ostringstream oss;
	const GArffRelation* pRel = (const GArffRelation*)&M.relation();
	pRel->printAttrValue(oss, 1, 2.0);
	string s = oss.str();
	if(strcmp(s.c_str(), "charlie") != 0)
		throw Ex("failed");
	if(M[0][0] != 1.2 || M[1][0] != 2.3 || M[2][0] != 3.4 || M[3][0] != 4.5)
		throw Ex("failed");
	if(M[0][1] != 0 || M[1][1] != 1 || M[2][1] != 1 || M[3][1] != 2)
		throw Ex("failed");
	if(M[0][2] != 0 || M[1][2] != 1 || M[2][2] != 0 || M[3][2] != 1)
		throw Ex("failed");
}
// virtual
void GNaiveBayes::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areNominal())
		throw Ex("GNaiveBayes only supports nominal features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areNominal())
		throw Ex("GNaiveBayes only supports nominal labels. Perhaps you should wrap it in a GAutoFilter.");
	beginIncrementalLearningInner(features.relation(), labels.relation());
	for(size_t n = 0; n < features.rows(); n++)
		trainIncremental(features[n], labels[n]);
}
// virtual
void GNaiveInstance::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GNaiveInstance only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GNaiveInstance only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");
	beginIncrementalLearningInner(features.relation(), labels.relation());
	for(size_t i = 0; i < features.rows(); i++)
		trainIncremental(features[i], labels[i]);
}
// virtual
void GLinearDistribution::trainInner(const GMatrix& features, const GMatrix& labels)
{
	if(!features.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous features. Perhaps you should wrap it in a GAutoFilter.");
	if(!labels.relation().areContinuous())
		throw Ex("GLinearDistribution only supports continuous labels. Perhaps you should wrap it in a GAutoFilter.");

	// Init A with the inverse of the weights prior covariance matrix
	size_t dims = features.cols();
	GMatrix a(dims, dims);
	a.setAll(0.0);

	// Init XY
	size_t labelDims = labels.cols();
	GMatrix xy(dims, labelDims);
	xy.setAll(0.0);

	// Train on each instance
	double w = 1.0 / (m_noiseDev * m_noiseDev);
	for(size_t i = 0; i < features.rows(); i++)
	{
		// Update A
		const GVec& feat = features[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = a[j];
			for(size_t k = 0; k < dims; k++)
				el[k] += feat[j] * feat[k];
		}

		// Update XY
		const GVec& lab = labels[i];
		for(size_t j = 0; j < dims; j++)
		{
			GVec& el = xy[j];
			for(size_t k = 0; k < labelDims; k++)
				el[k] += feat[j] * lab[k];
		}
	}
	a.multiply(w);
	xy.multiply(w);

	// Compute final matrices
	clear();
	m_pAInv = a.pseudoInverse();
	GAssert(m_pAInv->cols() == dims);
	GAssert(m_pAInv->rows() == dims);
	m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true);
	GAssert(m_pWBar->cols() == dims);
	GAssert(m_pWBar->rows() == labelDims);
	m_buf.resize(dims);
}
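// For reference (not part of the original source): with noise variance sigma^2 = m_noiseDev^2,
// the loops above accumulate
//     A  = (1/sigma^2) * sum_i x_i * x_i^T     (dims x dims)
//     XY = (1/sigma^2) * sum_i x_i * y_i^T     (dims x labelDims)
// and GMatrix::multiply(xy, *m_pAInv, true, true) then forms
//     WBar = XY^T * pinv(A)                    (labelDims x dims)
// Since A is initialized to zero rather than to an inverse prior covariance, this reduces to
// the ordinary least-squares solution for the mean weights.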
GBagTrainWorker(GMasterThread& master, GBag* pBag, const GMatrix& features, const GMatrix& labels, double trainSize, size_t seed)
: GWorkerThread(master), m_pBag(pBag), m_features(features), m_labels(labels), m_drawnFeatures(features.relation().clone()), m_drawnLabels(labels.relation().clone()), m_rand(seed)
{
	GAssert(m_features.rows() > 0);
	m_drawSize = size_t(trainSize * features.rows());
	m_drawnFeatures.reserve(m_drawSize);
	m_drawnLabels.reserve(m_drawSize);
}
void DropMissingValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();
	for(size_t i = pData->rows() - 1; i < pData->rows(); i--)
	{
		double* pPat = pData->row(i);
		bool drop = false;
		for(size_t j = 0; j < dims; j++)
		{
			if(pRelation->valueCount(j) == 0)
			{
				if(pPat[j] == UNKNOWN_REAL_VALUE)
				{
					drop = true;
					break;
				}
			}
			else
			{
				if(pPat[j] == UNKNOWN_DISCRETE_VALUE)
				{
					drop = true;
					break;
				}
			}
		}
		if(drop)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
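// Illustrative sketch (not part of the Waffles sources): DropMissingValues walks the rows
// backward with an unsigned index, so deleting row i never shifts the rows that are still to
// be visited, and the condition `i < pData->rows()` ends the loop because decrementing i past
// zero wraps around to SIZE_MAX. A minimal standalone demonstration of the same idiom:
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
	std::vector<int> v = {1, -1, 2, -1, 3};
	// Remove every negative element, iterating from the back toward the front.
	for(std::size_t i = v.size() - 1; i < v.size(); i--)
	{
		if(v[i] < 0)
			v.erase(v.begin() + i);
	}
	for(int x : v)
		printf("%d ", x); // prints: 1 2 3
	printf("\n");
	return 0;
}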
void split(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int pats = (int)pData->rows() - args.pop_uint();
	if(pats < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szFilename1 = args.pop_string();
	const char* szFilename2 = args.pop_string();

	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	bool shouldShuffle = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-shuffle"))
			shouldShuffle = true;
		else if(args.if_pop("-seed"))
			nSeed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle if necessary
	GRand rng(nSeed);
	if(shouldShuffle)
		pData->shuffle(rng);

	// Split
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFilename1);
	other.saveArff(szFilename2);
}
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(nFirst < 0 || nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			pPat[i] = (double)std::max((size_t)0, std::min(nBuckets - 1, (size_t)floor(((pPat[i] - min) * nBuckets) / range)));
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
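// Illustrative sketch (not part of the Waffles sources): the bucketing rule above is plain
// equal-width binning. A value x from a column with minimum `min` and range `range` lands in
//     bucket = clamp(floor((x - min) * nBuckets / range), 0, nBuckets - 1)
// A minimal standalone version of that computation:
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

static std::size_t bucketOf(double x, double min, double range, std::size_t nBuckets)
{
	std::size_t b = (std::size_t)std::floor(((x - min) * nBuckets) / range);
	return std::max((std::size_t)0, std::min(nBuckets - 1, b));
}

int main()
{
	// Values from [0, 10] split into 5 buckets of width 2.
	printf("%zu %zu %zu\n", bucketOf(0.5, 0.0, 10.0, 5), bucketOf(5.0, 0.0, 10.0, 5), bucketOf(10.0, 0.0, 10.0, 5));
	// prints: 0 2 4 (the maximum value is clamped into the last bucket)
	return 0;
}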
void test_parsearff_quoting()
{
	const char* inputArff =
		"@relation 'squares of numbers'\n"
		"\n"
		"@attribute 'the number' real\n"
		"\n"
		"@attribute 'the square of the number' real\n"
		"\n"
		"@attribute exact {'is exact', inexact,is\\\\\\ exact}\n"
		"\n"
		"@data\n"
		"1,1,'is exact'\n"
		"2,4,is\\ exact\n"
		"1.414,2,inexact\n"
		"3,9,\"is exact\"\n"
		"4,16,\"is\\ exact\"\n"
		;
	GMatrix M;
	M.parseArff(inputArff, strlen(inputArff));
	double expected_data[5][3] = {{1,1,0},{2,4,0},{1.414,2,1},{3,9,0},{4,16,2}};
	const GArffRelation* pRel = (const GArffRelation*)&M.relation();
	const GArffRelation& R = *pRel;
	TestEqual(R.size(), (std::size_t)3, "Incorrect number of attributes");
	for(unsigned row = 0; row < 5; ++row)
	{
		for(unsigned col = 0; col < 3; ++col)
		{
			std::stringstream errdescr;
			errdescr << "Incorrect matrix entry [" << row << "][" << col << "]";
			TestEqual(M[row][col], expected_data[row][col], errdescr.str());
		}
	}
	TestEqual(true, R.areContinuous(0,2), "First or second attribute is not continuous");
	TestEqual(true, R.areNominal(2,1), "Third attribute is not nominal");
	std::stringstream val0, val1, val2;
	R.printAttrValue(val0, 2, 0);
	R.printAttrValue(val1, 2, 1);
	R.printAttrValue(val2, 2, 2);
	TestEqual("'is exact'", val0.str(), "First value of third attribute incorrect name");
	TestEqual("inexact", val1.str(), "Second value of third attribute incorrect name");
	TestEqual("is\\ exact", val2.str(), "Third value of third attribute incorrect name");
	TestEqual("'the number'", R.attrName(0), "First attribute incorrect name");
	TestEqual("'the square of the number'", R.attrName(1), "Second attribute incorrect name");
	TestEqual("exact", R.attrName(2), "Third attribute incorrect name");
}
void splitFold(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t fold = args.pop_uint();
	size_t folds = args.pop_uint();
	if(fold >= folds)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Options
	string filenameTrain = "train.arff";
	string filenameTest = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			filenameTrain = args.pop_string();
			filenameTest = args.pop_string();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Copy relevant portions of the data
	GMatrix train(pData->relation());
	GMatrix test(pData->relation());
	size_t begin = pData->rows() * fold / folds;
	size_t end = pData->rows() * (fold + 1) / folds;
	for(size_t i = 0; i < begin; i++)
		train.copyRow(pData->row(i));
	for(size_t i = begin; i < end; i++)
		test.copyRow(pData->row(i));
	for(size_t i = end; i < pData->rows(); i++)
		train.copyRow(pData->row(i));
	train.saveArff(filenameTrain.c_str());
	test.saveArff(filenameTest.c_str());
}
void SwapAttributes(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nAttr1 = args.pop_uint();
	size_t nAttr2 = args.pop_uint();
	size_t attrCount = pData->relation()->size();
	if(nAttr1 >= attrCount)
		ThrowError("Index out of range");
	if(nAttr2 >= attrCount)
		ThrowError("Index out of range");
	pData->swapColumns(nAttr1, nAttr2);
	pData->print(cout);
}
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	size_t targetDims = args.pop_uint();

	// Parse Options
	const char* szPreprocessedData = NULL;
	double scaleRate = 0.999;
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else if(args.if_pop("-continue"))
			szPreprocessedData = args.pop_string();
		else if(args.if_pop("-scalerate"))
			scaleRate = args.pop_double();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data
	GMatrix* pDataHint = NULL;
	Holder<GMatrix> hDataHint(NULL);
	if(szPreprocessedData)
	{
		pDataHint = loadData(szPreprocessedData);
		hDataHint.reset(pDataHint);
		if(pDataHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pDataHint->rows() != pData->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data
	GManifoldSculpting transform(pNF->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(scaleRate);
	if(pDataHint)
		transform.setPreprocessedData(hDataHint.release());
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
// virtual
void GBag::trainInnerInner(GMatrix& features, GMatrix& labels)
{
	// Train all the models
	size_t nLearnerCount = m_models.size();
	size_t nDrawSize = size_t(m_trainSize * features.rows());
	GMatrix drawnFeatures(features.relation());
	GMatrix drawnLabels(labels.relation());
	drawnFeatures.reserve(nDrawSize);
	drawnLabels.reserve(nDrawSize);
	{
		for(size_t i = 0; i < nLearnerCount; i++)
		{
			if(m_pCB)
				m_pCB(m_pThis, i, nLearnerCount);

			// Randomly draw some data (with replacement)
			GReleaseDataHolder hDrawnFeatures(&drawnFeatures);
			GReleaseDataHolder hDrawnLabels(&drawnLabels);
			for(size_t j = 0; j < nDrawSize; j++)
			{
				size_t r = (size_t)m_rand.next(features.rows());
				drawnFeatures.takeRow(features[r]);
				drawnLabels.takeRow(labels[r]);
			}

			// Train the learner with the drawn data
			m_models[i]->m_pModel->train(drawnFeatures, drawnLabels);
		}
		if(m_pCB)
			m_pCB(m_pThis, nLearnerCount, nLearnerCount);
	}

	// Determine the weights
	determineWeights(features, labels);
	normalizeWeights();
}
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
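// For reference (not part of the original source): as the two inner loops visit each of the
// n = rows * cols cells exactly once, the test `rand.next(n) < k` drops the current cell with
// probability k/n, where k is the number of drops still owed and n the number of cells not yet
// visited (both are decremented as the scan proceeds). This is the classic selection-sampling
// scheme (Knuth's Algorithm S), so exactly floor(portion * rows * cols) cells end up marked
// unknown, chosen uniformly at random.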
void rotate(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	sp_relation relation = pA->relation();
	unsigned colx = args.pop_uint();
	if(colx >= pA->cols())
	{
		ThrowError("Rotation first column index (", to_str(colx), ") should not be greater "
			"than the largest index, which is ", to_str(pA->cols()-1), ".");
	}
	if(!relation->areContinuous(colx,1))
	{
		ThrowError("Rotation first column index (", to_str(colx), ") should be continuous and it is not.");
	}
	unsigned coly = args.pop_uint();
	if(coly >= pA->cols())
	{
		ThrowError("Rotation second column index (", to_str(coly), ") should not be greater "
			"than the largest index, which is ", to_str(pA->cols()-1), ".");
	}
	if(!relation->areContinuous(coly,1))
	{
		ThrowError("Rotation second column index (", to_str(coly), ") should be continuous and it is not.");
	}

	double angle = args.pop_double();
	angle = angle * M_PI / 180; // Convert from degrees to radians
	double cosAngle = std::cos(angle);
	double sinAngle = std::sin(angle);
	for(std::size_t rowIdx = 0; rowIdx < pA->rows(); ++rowIdx)
	{
		double* row = (*pA)[rowIdx];
		double x = row[colx];
		double y = row[coly];
		row[colx] = x*cosAngle - y*sinAngle;
		row[coly] = x*sinAngle + y*cosAngle;
	}
	pA->print(cout);
}
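// For reference (not part of the original source): the loop above applies the standard
// counter-clockwise 2-D rotation to the (colx, coly) coordinates of every row:
//     [x']   [cos(angle)  -sin(angle)] [x]
//     [y'] = [sin(angle)   cos(angle)] [y]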
void attributeSelector(GArgReader& args)
{
	// Load the data
	size_t labelDims;
	std::vector<size_t> originalIndices;
	GMatrix data;
	loadDataWithSwitches(data, args, labelDims, originalIndices);

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	int targetFeatures = 1;
	string outFilename = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-out"))
		{
			targetFeatures = args.pop_uint();
			outFilename = args.pop_string();
		}
		else
			throw Ex("Invalid neighbor finder option: ", args.peek());
	}

	// Do the attribute selection
	GRand prng(seed);
	GAttributeSelector as(labelDims, targetFeatures, &prng);
	if(outFilename.length() > 0)
	{
		as.train(data);
		GMatrix* pDataOut = as.transformBatch(data);
		Holder<GMatrix> hDataOut(pDataOut);
		cout << "Reduced data saved to " << outFilename.c_str() << ".\n";
		pDataOut->saveArff(outFilename.c_str());
	}
	else
		as.train(data);
	cout << "\nAttribute rankings from most salient to least salient. (Attributes are zero-indexed.)\n";
	GArffRelation* pRel = (GArffRelation*)data.relation().get();
	for(size_t i = 0; i < as.ranks().size(); i++)
		cout << originalIndices.at(as.ranks()[i]) << " " << pRel->attrName(as.ranks()[i]) << "\n";
}
// virtual
void GEnsemble::trainInner(const GMatrix& features, const GMatrix& labels)
{
	delete(m_pLabelRel);
	m_pLabelRel = labels.relation().clone();

	// Make the accumulator buffer
	size_t labelDims = m_pLabelRel->size();
	size_t nAccumulatorDims = 0;
	for(size_t i = 0; i < labelDims; i++)
	{
		size_t nValues = m_pLabelRel->valueCount(i);
		if(nValues > 0)
			nAccumulatorDims += nValues;
		else
			nAccumulatorDims += 2; // mean and variance
	}
	m_accumulator.resize(nAccumulatorDims);

	trainInnerInner(features, labels);
}
// virtual
void GEnsemble::trainInner(GMatrix& features, GMatrix& labels)
{
	m_pLabelRel = labels.relation();

	// Make the accumulator buffer
	size_t labelDims = m_pLabelRel->size();
	m_nAccumulatorDims = 0;
	for(size_t i = 0; i < labelDims; i++)
	{
		size_t nValues = m_pLabelRel->valueCount(i);
		if(nValues > 0)
			m_nAccumulatorDims += nValues;
		else
			m_nAccumulatorDims += 2; // mean and variance
	}
	delete[] m_pAccumulator;
	m_pAccumulator = new double[m_nAccumulatorDims];

	trainInnerInner(features, labels);
}
// virtual
void GNaiveBayes::trainSparse(GSparseMatrix& features, GMatrix& labels)
{
	if(features.rows() != labels.rows())
		throw Ex("Expected the features and labels to have the same number of rows");
	size_t featureDims = features.cols();
	GUniformRelation featureRel(featureDims, 2);
	beginIncrementalLearning(featureRel, labels.relation());
	GVec fullRow(featureDims);
	for(size_t n = 0; n < features.rows(); n++)
	{
		features.fullRow(fullRow, n);
		for(size_t i = 0; i < featureDims; i++)
		{
			if(fullRow[i] < 1e-6)
				fullRow[i] = 0.0;
			else
				fullRow[i] = 1.0;
		}
		trainIncremental(fullRow, labels[n]);
	}
}
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	pTransition->newRows(pActions->rows() - 1);
	for(size_t i = 0; i < pActions->rows() - 1; i++)
	{
		double* pOut = pTransition->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
void Import(GArgReader& args)
{
	// Load the file
	size_t len;
	const char* filename = args.pop_string();
	char* pFile = GFile::loadFile(filename, &len);
	ArrayHolder<char> hFile(pFile);

	// Parse Options
	char separator = ',';
	bool tolerant = false;
	bool columnNamesInFirstRow = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = '\t';
		else if(args.if_pop("-space"))
			separator = ' ';
		else if(args.if_pop("-whitespace"))
			separator = '\0';
		else if(args.if_pop("-semicolon"))
			separator = ';';
		else if(args.if_pop("-separator"))
			separator = args.pop_string()[0];
		else if(args.if_pop("-tolerant"))
			tolerant = true;
		else if(args.if_pop("-columnnames"))
			columnNamesInFirstRow = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Parse the file
	GMatrix* pData = GMatrix::parseCsv(pFile, len, separator, columnNamesInFirstRow, tolerant);
	Holder<GMatrix> hData(pData);
	((GArffRelation*)pData->relation().get())->setName(filename);

	// Print the data
	pData->print(cout);
}
void Export(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	const char* separator = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = "\t";
		else if(args.if_pop("-space"))
			separator = " ";
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print
	for(size_t i = 0; i < pData->rows(); i++)
		pData->relation()->printRow(cout, pData->row(i), separator);
}
void aggregateRows(GArgReader& args)
{
	size_t r = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	for(vector<string>::iterator it = files.begin(); it != files.end(); it++)
	{
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue;
		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);
		if(!pResults)
		{
			pResults = new GMatrix(pData->relation());
			hResults.reset(pResults);
		}
		pResults->takeRow(pData->releaseRow(r));
	}
	pResults->print(cout);
}
void SortByAttribute(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t nAttr = args.pop_uint();
	size_t attrCount = pData->relation()->size();
	if(nAttr >= attrCount)
		ThrowError("Index out of range");

	// Parse options
	bool descending = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-descending"))
			descending = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	pData->sort(nAttr);
	if(descending)
		pData->reverseRows();
	pData->print(cout);
}
// virtual
void GResamplingAdaBoost::trainInnerInner(const GMatrix& features, const GMatrix& labels)
{
	clear();

	// Initialize all instances with uniform weights
	GVec pDistribution(features.rows());
	pDistribution.fill(1.0 / features.rows());
	size_t drawRows = size_t(m_trainSize * features.rows());
	size_t* pDrawnIndexes = new size_t[drawRows];
	std::unique_ptr<size_t[]> hDrawnIndexes(pDrawnIndexes);

	// Train the ensemble
	size_t labelDims = labels.cols();
	double penalty = 1.0 / labelDims;
	GVec prediction(labelDims);
	for(size_t es = 0; es < m_ensembleSize; es++)
	{
		// Draw a training set from the distribution
		GCategoricalSamplerBatch csb(features.rows(), pDistribution, m_rand);
		csb.draw(drawRows, pDrawnIndexes);
		GMatrix drawnFeatures(features.relation().clone());
		GReleaseDataHolder hDrawnFeatures(&drawnFeatures);
		GMatrix drawnLabels(labels.relation().clone());
		GReleaseDataHolder hDrawnLabels(&drawnLabels);
		size_t* pIndex = pDrawnIndexes;
		for(size_t i = 0; i < drawRows; i++)
		{
			drawnFeatures.takeRow((GVec*)&features[*pIndex]);
			drawnLabels.takeRow((GVec*)&labels[*pIndex]);
			pIndex++;
		}

		// Train an instance of the model and store a clone of it
		m_pLearner->train(drawnFeatures, drawnLabels);
		GDom doc;
		GSupervisedLearner* pClone = m_pLoader->loadLearner(m_pLearner->serialize(&doc));

		// Compute model weight
		double err = 0.5;
		for(size_t i = 0; i < features.rows(); i++)
		{
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
		}
		err /= features.rows();
		if(err >= 0.5)
		{
			delete(pClone);
			break;
		}
		double weight = 0.5 * log((1.0 - err) / err);
		m_models.push_back(new GWeightedModel(weight, pClone));

		// Update the distribution to favor mis-classified instances
		for(size_t i = 0; i < features.rows(); i++)
		{
			err = 0.0;
			pClone->predict(features[i], prediction);
			const GVec& target = labels[i];
			for(size_t j = 0; j < labelDims; j++)
			{
				if((int)target[j] != (int)prediction[j])
					err += penalty;
			}
			err /= labelDims;
			pDistribution[i] *= exp(weight * (err * 2.0 - 1.0));
		}
		pDistribution.sumToOne();
	}
	normalizeWeights();
}
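// For reference (not part of the original source): the loop above is the usual AdaBoost
// update, applied with a resampled training set. The running error is seeded with 0.5 before
// averaging, which keeps err strictly positive so a perfect model cannot receive infinite
// weight. Each accepted model gets the weight
//     weight = 0.5 * ln((1 - err) / err)
// and the sampling distribution is then re-weighted and re-normalized as
//     D_i <- D_i * exp(weight * (2 * err_i - 1))
// where err_i is the per-row error computed from the same penalty terms. After normalization,
// rows the new model got wrong are relatively more likely to be drawn for the next model, and
// rows it got right are relatively less likely.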