// Splits a dataset into two ARFF files.
// Positional arguments: the dataset filename, a row count (subtracted from the
// total number of rows to obtain the size handed to splitBySize), and the two
// output filenames. Options: "-shuffle" and "-seed <n>".
void split(GArgReader& args)
{
	// Load the dataset
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Work out the split size, and refuse a count larger than the dataset
	int pats = (int)pData->rows() - args.pop_uint();
	if(pats < 0)
		ThrowError("out of range. The data only has ", to_str(pData->rows()), " rows.");
	const char* szFirstOut = args.pop_string();
	const char* szSecondOut = args.pop_string();

	// Parse the options. Unless overridden, the seed is derived from the
	// process id and the current time.
	bool shuffleFirst = false;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	for( ; args.size() > 0; )
	{
		if(args.if_pop("-shuffle"))
		{
			shuffleFirst = true;
			continue;
		}
		if(args.if_pop("-seed"))
		{
			seed = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Shuffle only when requested
	GRand rng(seed);
	if(shuffleFirst)
		pData->shuffle(rng);

	// Split and save both parts
	GMatrix other(pData->relation());
	pData->splitBySize(&other, pats);
	pData->saveArff(szFirstOut);
	other.saveArff(szSecondOut);
}
// Prepends a continuous "index" column to a dataset and prints the result.
// Row i receives the value start + i * increment. Options: "-start <value>"
// (default 0) and "-increment <value>" (default 1).
void AddIndexAttribute(GArgReader& args)
{
	// Parse the arguments
	const char* szDataFile = args.pop_string();
	double first = 0.0;
	double stride = 1.0;
	for( ; args.size() > 0; )
	{
		if(args.if_pop("-start"))
		{
			first = args.pop_double();
			continue;
		}
		if(args.if_pop("-increment"))
		{
			stride = args.pop_double();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Load the data
	GMatrix* pData = loadData(szDataFile);
	Holder<GMatrix> hData(pData);

	// Build a one-column matrix holding the index values
	GArffRelation* pIndexRelation = new GArffRelation();
	pIndexRelation->addAttribute("index", 0, NULL);
	sp_relation pIndexRel = pIndexRelation;
	GMatrix indexes(pIndexRel);
	indexes.newRows(pData->rows());
	size_t r = 0;
	while(r < pData->rows())
	{
		indexes.row(r)[0] = first + r * stride;
		r++;
	}

	// Join the index column with the original columns and print
	GMatrix* pUnified = GMatrix::mergeHoriz(&indexes, pData);
	Holder<GMatrix> hUnified(pUnified);
	pUnified->print(cout);
}
// Prints the average distance from each row to its closest neighbor, and the
// average distance to all of its requested neighbors. Arguments: the dataset
// filename and the neighbor count. The square root of each value reported by
// the neighbor finder is what gets accumulated.
void neighbors(GArgReader& args)
{
	// Load the data and read the neighbor count
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int neighborCount = args.pop_uint();

	// Scratch buffers filled by the neighbor finder for each row
	GKdTree neighborFinder(pData, neighborCount, NULL, true);
	GTEMPBUF(size_t, nearest, neighborCount);
	GTEMPBUF(double, dists, neighborCount);

	// Accumulate the distances
	double sumClosest = 0;
	double sumAll = 0;
	const size_t rowCount = pData->rows();
	for(size_t row = 0; row < rowCount; row++)
	{
		neighborFinder.neighbors(nearest, dists, row);
		neighborFinder.sortNeighbors(nearest, dists);
		sumClosest += sqrt(dists[0]);
		int j = 0;
		while(j < neighborCount)
		{
			sumAll += sqrt(dists[j]);
			j++;
		}
	}

	// Report the averages
	cout.precision(14);
	cout << "average closest neighbor distance = " << (sumClosest / rowCount) << "\n";
	cout << "average neighbor distance = " << (sumAll / (rowCount * neighborCount)) << "\n";
}
// Deletes every row that contains an unknown value, then prints the data.
// Columns whose valueCount is 0 are compared against UNKNOWN_REAL_VALUE; all
// other columns are compared against UNKNOWN_DISCRETE_VALUE.
void DropMissingValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();

	// Walk the rows from last to first, so a deletion never affects the
	// rows that are still to be visited.
	for(size_t i = pData->rows(); i-- > 0; )
	{
		double* pPat = pData->row(i);

		// Stop at the first unknown value; j < dims afterwards means one was found
		size_t j = 0;
		for( ; j < dims; j++)
		{
			const bool continuous = (pRelation->valueCount(j) == 0);
			const bool unknown = continuous ? (pPat[j] == UNKNOWN_REAL_VALUE) : (pPat[j] == UNKNOWN_DISCRETE_VALUE);
			if(unknown)
				break;
		}
		if(j < dims)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
// virtual void GGaussianProcess::trainInner(const GMatrix& features, const GMatrix& labels) { if(!features.relation().areContinuous()) throw Ex("GGaussianProcess only supports continuous features. Perhaps you should wrap it in a GAutoFilter."); if(!labels.relation().areContinuous()) throw Ex("GGaussianProcess only supports continuous labels. Perhaps you should wrap it in a GAutoFilter."); if(features.rows() <= m_maxSamples) { trainInnerInner(features, labels); return; } GMatrix f(features.relation().clone()); GReleaseDataHolder hF(&f); GMatrix l(labels.relation().clone()); GReleaseDataHolder hL(&l); for(size_t i = 0; i < features.rows(); i++) { f.takeRow((GVec*)&features[i]); l.takeRow((GVec*)&labels[i]); } while(f.rows() > m_maxSamples) { size_t i = (size_t)m_rand.next(f.rows()); f.releaseRow(i); l.releaseRow(i); } trainInnerInner(f, l); }
/***********************************************************************//**
 * @brief GMatrix to GSymMatrix storage class convertor
 *
 * @param[in] matrix General matrix (GMatrix).
 *
 * @exception GException::matrix_not_symmetric
 *            Matrix is not symmetric (this includes non-square matrices).
 *
 * Converts a general matrix into the symmetric storage class. If the input
 * matrix is not symmetric, an exception is thrown. A matrix with a different
 * number of rows and columns is rejected before any element is read, since
 * the element-wise symmetry test would otherwise access the transposed
 * position (col,row) outside of the matrix.
 ***************************************************************************/
GSymMatrix::GSymMatrix(const GMatrix& matrix)
{
    // Initialise class members for clean destruction
    init_members();

    // A non-square matrix can never be symmetric
    if (matrix.rows() != matrix.cols()) {
        throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                               matrix.rows(), matrix.cols());
    }

    // Allocate matrix memory
    alloc_members(matrix.rows(), matrix.cols());

    // Fill matrix from the lower triangle, checking each element against
    // its mirror image in the upper triangle
    for (int col = 0; col < matrix.cols(); ++col) {
        for (int row = col; row < matrix.rows(); ++row) {
            double value_ll = matrix(row,col);
            double value_ur = matrix(col,row);
            if (value_ll != value_ur) {
                throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                                       matrix.rows(),
                                                       matrix.cols());
            }
            (*this)(row, col) = value_ll;
        }
    }
}
// Prints a table of auto-covariance values. Each output row starts with a lag
// (1 up to min(256, rows / 2)); it is followed, for every column, by the mean
// product of the centroid-subtracted values that are that many rows apart.
void autoCorrelation(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t lag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();

	// Per-column centroid, subtracted from every value below
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);

	GMatrix ac(0, dims + 1);
	for(size_t shift = 1; shift <= lag; shift++)
	{
		double* pOut = ac.newRow();
		pOut[0] = (double)shift;
		for(size_t j = 0; j < dims; j++)
		{
			double& acc = pOut[j + 1];
			acc = 0;

			// k doubles as the count of row pairs once the loop ends
			size_t k = 0;
			while(k + shift < pData->rows())
			{
				double* pA = pData->row(k);
				double* pB = pData->row(k + shift);
				acc += (pA[j] - mean[j]) * (pB[j] - mean[j]);
				k++;
			}
			acc /= k;
		}
	}
	ac.print(cout);
}
void plot_it(const char* filename, GNeuralNet& nn, GMatrix& trainFeat, GMatrix& trainLab, GMatrix& testFeat, GMatrix& testLab) { GSVG svg(1000, 500); double xmin = trainFeat[0][0]; double xmax = testFeat[testFeat.rows() - 1][0]; svg.newChart(xmin, std::min(trainLab.columnMin(0), testLab.columnMin(0)), xmax, std::max(trainLab.columnMax(0), testLab.columnMax(0))); svg.horizMarks(20); svg.vertMarks(20); double prevx = xmin; double prevy = 0.0; double step = (xmax - xmin) / 500.0; GVec x(1); GVec y(1); for(x[0] = prevx; x[0] < xmax; x[0] += step) { nn.predict(x, y); if(prevx != x[0]) svg.line(prevx, prevy, x[0], y[0], 0.3); prevx = x[0]; prevy = y[0]; } for(size_t i = 0; i < trainLab.rows(); i++) svg.dot(trainFeat[i][0], trainLab[i][0], 0.4, 0xff000080); for(size_t i = 0; i < testLab.rows(); i++) svg.dot(testFeat[i][0], testLab[i][0], 0.4, 0xff800000); std::ofstream ofs; ofs.open(filename); svg.print(ofs); }
/// Compute the anticipated belief vector that will result if the specified plan is executed. void TransitionModel::getFinalBeliefs(const GVec& beliefs, const GMatrix& plan, GVec& outFinalBeliefs) { if(plan.rows() > 0) anticipateNextBeliefs(beliefs, plan[0], outFinalBeliefs); for(size_t i = 1; i < plan.rows(); i++) { anticipateNextBeliefs(outFinalBeliefs, plan[i], outFinalBeliefs); } }
// Deletes rows from the end of a dataset until at most the requested number
// remain, then prints the data. Arguments: the dataset filename and the new
// row count.
void dropRows(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	const size_t newSize = args.pop_uint();

	// Remove the last row repeatedly until the target size is reached
	for(;;)
	{
		const size_t current = pData->rows();
		if(current <= newSize)
			break;
		pData->deleteRow(current - 1);
	}
	pData->print(cout);
}
// Prints the sum, mean, and root-mean of the squared difference between two
// datasets. The mean is taken over the row count of the first dataset.
void squaredDistance(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	GMatrix* pB = loadData(args.pop_string());
	Holder<GMatrix> hB(pB);

	const double sum = pA->sumSquaredDifference(*pB, false);
	const double mean = sum / pA->rows();
	cout << "Sum squared distance: " << sum << "\n";
	cout << "Mean squared distance: " << mean << "\n";
	cout << "Root mean squared distance: " << sqrt(mean) << "\n";
}
void GNaiveBayes_testMath() { const char* trainFile = "@RELATION test\n" "@ATTRIBUTE a {t,f}\n" "@ATTRIBUTE b {r,g,b}\n" "@ATTRIBUTE c {y,n}\n" "@DATA\n" "t,r,y\n" "f,r,n\n" "t,g,y\n" "f,g,y\n" "f,g,n\n" "t,r,n\n" "t,r,y\n" "t,b,y\n" "f,r,y\n" "f,g,n\n" "f,b,y\n" "t,r,n\n"; GMatrix train; train.parseArff(trainFile, strlen(trainFile)); GMatrix* pFeatures = train.cloneSub(0, 0, train.rows(), 2); std::unique_ptr<GMatrix> hFeatures(pFeatures); GMatrix* pLabels = train.cloneSub(0, 2, train.rows(), 1); std::unique_ptr<GMatrix> hLabels(pLabels); GNaiveBayes nb; nb.setEquivalentSampleSize(0.0); nb.train(*pFeatures, *pLabels); GPrediction out; GVec pat(2); pat[0] = 0; pat[1] = 0; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*3.0/7.0, 5.0/12.0, 2.0/5.0*3.0/5.0, &out); pat[0] = 0; pat[1] = 1; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*2.0/7.0, 5.0/12.0, 2.0/5.0*2.0/5.0, &out); pat[0] = 0; pat[1] = 2; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 4.0/7.0*2.0/7.0, 5.0/12.0, 2.0/5.0*0.0/5.0, &out); pat[0] = 1; pat[1] = 0; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*3.0/7.0, 5.0/12.0, 3.0/5.0*3.0/5.0, &out); pat[0] = 1; pat[1] = 1; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*2.0/7.0, 5.0/12.0, 3.0/5.0*2.0/5.0, &out); pat[0] = 1; pat[1] = 2; nb.predictDistribution(pat, &out); GNaiveBayes_CheckResults(7.0/12.0, 3.0/7.0*2.0/7.0, 5.0/12.0, 3.0/5.0*0.0/5.0, &out); }
// Runs GManifoldSculpting on a dataset and prints the transformed data.
// Arguments: the dataset filename, the neighbor-finder arguments (consumed by
// instantiateNeighborFinder), and the number of target dimensions.
// Options: "-seed <n>", "-continue <file>" (previously processed data to
// start from), and "-scalerate <value>" (default 0.999).
void ManifoldSculpting(GArgReader& args)
{
	// Load the file and the positional params
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	unsigned int nSeed = getpid() * (unsigned int)time(NULL);
	GRand prng(nSeed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pData, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	size_t targetDims = args.pop_uint();

	// Parse the options
	const char* szHintFile = NULL;
	double squishRate = 0.999;
	for( ; args.size() > 0; )
	{
		if(args.if_pop("-seed"))
		{
			prng.setSeed(args.pop_uint());
			continue;
		}
		if(args.if_pop("-continue"))
		{
			szHintFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-scalerate"))
		{
			squishRate = args.pop_double();
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// Load the hint data, which must match the target dimensions and row count
	GMatrix* pDataHint = NULL;
	Holder<GMatrix> hDataHint(NULL);
	if(szHintFile)
	{
		pDataHint = loadData(szHintFile);
		hDataHint.reset(pDataHint);
		if(pDataHint->relation()->size() != targetDims)
			throw Ex("Wrong number of dims in the hint data");
		if(pDataHint->rows() != pData->rows())
			throw Ex("Wrong number of patterns in the hint data");
	}

	// Transform the data. The hint matrix is released from its holder when
	// it is handed to the transform.
	GManifoldSculpting transform(pNF->neighborCount(), targetDims, &prng);
	transform.setSquishingRate(squishRate);
	if(pDataHint)
		transform.setPreprocessedData(hDataHint.release());
	transform.setNeighborFinder(pNF);
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);
	pDataAfter->print(cout);
}
// Computes the singular value decomposition of a dataset. U and V are saved to
// ARFF files ("u.arff" and "v.arff" unless overridden). If "-sigmafilename" is
// given, the diagonal is saved as a full matrix in that file; otherwise the
// diagonal values are printed. Options: "-ufilename", "-sigmafilename",
// "-vfilename", and "-maxiters" (default 100).
void singularValueDecomposition(GArgReader& args)
{
	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	string uFile = "u.arff";
	string sigmaFile;
	string vFile = "v.arff";
	int maxIters = 100;
	for( ; args.size() > 0; )
	{
		if(args.if_pop("-ufilename"))
		{
			uFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-sigmafilename"))
		{
			sigmaFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-vfilename"))
		{
			vFile = args.pop_string();
			continue;
		}
		if(args.if_pop("-maxiters"))
		{
			maxIters = args.pop_uint();
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Decompose, and take ownership of the three results
	GMatrix* pU;
	double* pDiag;
	GMatrix* pV;
	pData->singularValueDecomposition(&pU, &pDiag, &pV, false, maxIters);
	Holder<GMatrix> hU(pU);
	ArrayHolder<double> hDiag(pDiag);
	Holder<GMatrix> hV(pV);

	// Save U and V
	pU->saveArff(uFile.c_str());
	pV->saveArff(vFile.c_str());

	// Print the diagonal when no sigma file was requested
	if(sigmaFile.length() == 0)
	{
		GVec::print(cout, 14, pDiag, std::min(pU->rows(), pV->rows()));
		cout << "\n";
		return;
	}

	// Otherwise expand the diagonal into a full matrix and save it
	GMatrix sigma(pU->rows(), pV->rows());
	sigma.setAll(0.0);
	size_t m = std::min(sigma.rows(), (size_t)sigma.cols());
	for(size_t d = 0; d < m; d++)
		sigma.row(d)[d] = pDiag[d];
	sigma.saveArff(sigmaFile.c_str());
}
// Refines the linear model (weights in m_pBeta, biases in m_pEpsilon) with
// stochastic gradient descent.
//   features, labels        - the training data; row i of each belongs together
//   learningRate            - the initial step size
//   epochs                  - the number of passes over the data, each in a
//                             freshly shuffled order
//   learningRateDecayFactor - the step size is multiplied by this after each epoch
// For every pattern and label dimension, the step size is divided by the
// squared magnitude of the gradient (feature terms plus the bias term) whenever
// that magnitude exceeds 1.
void GLinearRegressor::refine(GMatrix& features, GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor)
{
	size_t fDims = features.cols();
	size_t lDims = labels.cols();

	// Visit the rows through a shuffled index vector
	size_t* pIndexes = new size_t[features.rows()];
	ArrayHolder<size_t> hIndexes(pIndexes);
	GIndexVec::makeIndexVec(pIndexes, features.rows());
	for(size_t i = 0; i < epochs; i++)
	{
		GIndexVec::shuffle(pIndexes, features.rows(), &m_rand);
		size_t* pIndex = pIndexes;
		for(size_t j = 0; j < features.rows(); j++)
		{
			double* pFeat = features[*pIndex];
			double* pLab = labels[*pIndex];
			double* pBias = m_pEpsilon;
			for(size_t k = 0; k < lDims; k++)
			{
				double* pW = m_pBeta->row(k);
				double err = *pLab - (GVec::dotProduct(pFeat, pW, fDims) + *pBias);

				// Squared magnitude of the gradient. The trailing
				// err * err is the bias component (its input is 1).
				double mag = 0.0;
				for(size_t l = 0; l < fDims; l++)
				{
					double d = pFeat[l] * err;
					mag += (d * d);
				}
				mag += err * err;

				// Shrink the step when the gradient is large
				double lr = learningRate;
				if(mag > 1.0)
					lr /= mag;

				// Update the weights
				for(size_t l = 0; l < fDims; l++)
					pW[l] += pFeat[l] * lr * err;

				// Update the bias with the same shrunken step. It used
				// the raw learningRate before, which bypassed the
				// shrinking even though the bias term is part of mag.
				*pBias += lr * err;
				pLab++;
				pBias++;
			}
			pIndex++;
		}
		learningRate *= learningRateDecayFactor;
	}
}
// Replaces a randomly chosen portion of the values in a dataset with unknown
// values, then prints the data. Arguments: the dataset filename and the
// portion of values to drop. Option: "-seed <n>".
// Columns whose valueCount is 0 receive UNKNOWN_REAL_VALUE; all other columns
// receive UNKNOWN_DISCRETE_VALUE.
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData); // owns the data, so it is freed on every exit path
	double portion = args.pop_double();

	// A negative (or NaN) portion cannot be converted to a count
	if(!(portion >= 0.0))
		ThrowError("The portion of values to drop must not be negative");

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}
	GRand rand(seed);

	// Selection sampling: n counts the cells not yet visited, and k counts
	// the values still to be dropped. Each cell is dropped with
	// probability k / n.
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		const bool continuous = (pData->relation()->valueCount(i) == 0);
		const double unknownValue = continuous ? UNKNOWN_REAL_VALUE : UNKNOWN_DISCRETE_VALUE;
		for(size_t j = 0; j < pData->rows(); j++)
		{
			if(rand.next(n) < k)
			{
				pData->row(j)[i] = unknownValue;
				k--;
			}
			n--;
		}
	}
	pData->print(cout);
}
// Operates on one column of a dataset, then prints the data. Arguments: the
// dataset filename and the column index.
// If the column has a nonzero valueCount, its value count is set to 0.
// Otherwise each distinct value in the column is replaced by the order
// (0, 1, 2, ...) in which that value was first encountered.
void enumerateValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();

	if(pData->relation()->valueCount(col) > 0)
	{
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
		pData->print(cout);
		return;
	}

	// Maps each distinct value to its first-seen ordinal. A new value gets
	// the current map size as its ordinal; a repeated value leaves the map
	// unchanged, and its existing ordinal is returned.
	typedef map<double,size_t> ValueIndex;
	ValueIndex seen;
	const size_t rowCount = pData->rows();
	for(size_t r = 0; r < rowCount; r++)
	{
		double* pRow = pData->row(r);
		std::pair<ValueIndex::iterator, bool> slot = seen.insert(ValueIndex::value_type(pRow[col], seen.size()));
		pRow[col] = (double)slot.first->second;
	}
	pData->print(cout);
}
// Collects one column (index given by the first argument) from every ".arff"
// file returned by GFile::fileList into a single matrix, one output column per
// file, and prints it.
// NOTE(review): the result matrix is allocated with one column per listed
// file, not per .arff file, so trailing columns are never filled when other
// files are present. Confirm whether that is intended.
void aggregateCols(GArgReader& args)
{
	size_t c = args.pop_uint();
	vector<string> files;
	GFile::fileList(files);
	GMatrix* pResults = NULL;
	Holder<GMatrix> hResults;
	size_t i = 0;
	for(vector<string>::iterator it = files.begin(); it != files.end(); ++it)
	{
		// Skip everything that does not have the .arff extension
		PathData pd;
		GFile::parsePath(it->c_str(), &pd);
		if(strcmp(it->c_str() + pd.extStart, ".arff") != 0)
			continue;

		GMatrix* pData = loadData(it->c_str());
		Holder<GMatrix> hData(pData);

		// The first .arff file determines the number of rows
		if(!pResults)
		{
			pResults = new GMatrix(pData->rows(), files.size());
			hResults.reset(pResults);
		}
		pResults->copyColumns(i, pData, c, 1);
		i++;
	}

	// Without any .arff file there is no result matrix to print. This used
	// to dereference a NULL pointer.
	if(!pResults)
	{
		std::cerr << "No .arff files were found, so there is nothing to aggregate.\n";
		return;
	}
	pResults->print(cout);
}
// Adds noise (dev * prng.normal()) to every value in a dataset, then prints
// the data. Arguments: the dataset filename and the deviation.
// Options: "-seed <n>" and "-excludelast <n>" (leave the last n columns
// untouched).
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	size_t excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Excluding more columns than exist would make the unsigned subtraction
	// below wrap around, and the loop would write far past the end of each row.
	if(excludeLast > pData->cols())
		ThrowError("The -excludelast value exceeds the number of columns in the data");

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			pRow[c] += dev * prng.normal();
	}
	pData->print(cout);
}
void test_transform_mergevert() { // Make some input files TempFileMaker tempFile1("a.arff", "@RELATION test\n" "@ATTRIBUTE a1 continuous\n" "@ATTRIBUTE a2 { alice, bob }\n" "@ATTRIBUTE a3 { true, false }\n" "@DATA\n" "1.2, alice, true\n" "2.3, bob, false\n" ); TempFileMaker tempFile2("b.arff", "@RELATION test\n" "@ATTRIBUTE a1 continuous\n" "@ATTRIBUTE a2 { charlie, bob }\n" "@ATTRIBUTE a3 { false, true }\n" "@DATA\n" "3.4, bob, true\n" "4.5, charlie, false\n" ); // Execute the command GPipe pipeStdOut; if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0) throw Ex("exit status indicates failure"); char buf[512]; size_t len = pipeStdOut.read(buf, 512); if(len == 512) throw Ex("need a bigger buffer"); buf[len] = '\0'; // Check the results GMatrix M; M.parseArff(buf, strlen(buf)); if(M.rows() != 4 || M.cols() != 3) throw Ex("failed"); if(M.relation().valueCount(0) != 0) throw Ex("failed"); if(M.relation().valueCount(1) != 3) throw Ex("failed"); if(M.relation().valueCount(2) != 2) throw Ex("failed"); std::ostringstream oss; const GArffRelation* pRel = (const GArffRelation*)&M.relation(); pRel->printAttrValue(oss, 1, 2.0); string s = oss.str(); if(strcmp(s.c_str(), "charlie") != 0) throw Ex("failed"); if(M[0][0] != 1.2 || M[1][0] != 2.3 || M[2][0] != 3.4 || M[3][0] != 4.5) throw Ex("failed"); if(M[0][1] != 0 || M[1][1] != 1 || M[2][1] != 1 || M[3][1] != 2) throw Ex("failed"); if(M[0][2] != 0 || M[1][2] != 1 || M[2][2] != 0 || M[3][2] != 1) throw Ex("failed"); }
// Replaces the continuous attributes in a range of columns with bucket
// indexes, then prints the data. Each value is mapped to
// floor((value - min) * buckets / range), clamped to [0, buckets - 1].
// Options: "-buckets <n>" (default max(2, floor(sqrt(rows + 0.5)))) and
// "-colrange <first> <last>" (inclusive; default is all columns).
// Columns with a nonzero valueCount are skipped.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	const size_t attrCount = pData->relation()->size();
	size_t nFirst = 0;
	size_t nLast = attrCount - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// nFirst is unsigned, so only the upper bound and the ordering need
	// checking. (This also rejects a relation with no attributes, because
	// attrCount - 1 wraps around to the largest size_t.)
	if(nLast >= attrCount || nLast < nFirst)
		ThrowError("column index out of range");

	// With zero buckets, nBuckets - 1 would wrap around
	if(nBuckets < 1)
		ThrowError("the number of buckets must be at least 1");

	// Discretize the continuous attributes in the specified range
	const double topBucket = (double)(nBuckets - 1);
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);

			// Clamp while still in floating point. Converting a
			// negative or NaN value (e.g. when range is 0) to size_t
			// is undefined, so it must not reach the cast.
			double bucket = floor(((pPat[i] - min) * nBuckets) / range);
			size_t index;
			if(!(bucket > 0.0))
				index = 0;
			else if(bucket >= topBucket)
				index = nBuckets - 1;
			else
				index = (size_t)bucket;
			pPat[i] = (double)index;
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
// virtual void GNaiveBayes::trainInner(const GMatrix& features, const GMatrix& labels) { if(!features.relation().areNominal()) throw Ex("GNaiveBayes only supports nominal features. Perhaps you should wrap it in a GAutoFilter."); if(!labels.relation().areNominal()) throw Ex("GNaiveBayes only supports nominal labels. Perhaps you should wrap it in a GAutoFilter."); beginIncrementalLearningInner(features.relation(), labels.relation()); for(size_t n = 0; n < features.rows(); n++) trainIncremental(features[n], labels[n]); }
// virtual void GNaiveInstance::trainInner(const GMatrix& features, const GMatrix& labels) { if(!features.relation().areContinuous()) throw Ex("GNaiveInstance only supports continuous features. Perhaps you should wrap it in a GAutoFilter."); if(!labels.relation().areContinuous()) throw Ex("GNaiveInstance only supports continuous labels. Perhaps you should wrap it in a GAutoFilter."); beginIncrementalLearningInner(features.relation(), labels.relation()); for(size_t i = 0; i < features.rows(); i++) trainIncremental(features[i], labels[i]); }
void test_recommend_fillmissingvalues() { // Make some input files TempFileMaker tempFile1("a.arff", "@RELATION test\n" "@ATTRIBUTE a1 { a, b, c }\n" "@ATTRIBUTE a2 continuous\n" "@ATTRIBUTE a3 { d, e, f }\n" "@ATTRIBUTE a4 { g, h, i }\n" "@DATA\n" "a, ?, f, i\n" "?, 2, ?, i\n" "b, ?, d, ?\n" "?, 4, ?, ?\n" "?, ?, e, g\n" "?, ?, e, ?\n" "a, ?, ?, h\n" "\n" ); // Execute the command GPipe pipeStdOut; if(sysExec("waffles_recommend", "fillmissingvalues a.arff baseline", &pipeStdOut) != 0) throw Ex("exit status indicates failure"); char buf[512]; size_t len = pipeStdOut.read(buf, 512); if(len == 512) throw Ex("need a bigger buffer"); buf[len] = '\0'; // Check the results GMatrix M; M.parseArff(buf, strlen(buf)); if(M.rows() != 7 || M.cols() != 4) throw Ex("failed"); if(M[0][0] != 0) throw Ex("failed"); if(M[0][1] != 3) throw Ex("failed"); if(M[1][1] != 2) throw Ex("failed"); if(M[2][1] != 3) throw Ex("failed"); if(M[3][3] != 2) throw Ex("failed"); if(M[4][0] != 0) throw Ex("failed"); if(M[5][1] != 3) throw Ex("failed"); if(M[6][2] != 1) throw Ex("failed"); if(M[6][3] != 1) throw Ex("failed"); }
// Applies the transform encoded in pVector to every row of m_pData2, storing
// the results in the corresponding rows of m_transformed. The first m_attrs
// elements of pVector are added to each multiplied row; the elements after
// them are passed to m_transform.fromVector.
void TransformData(const double* pVector)
{
	m_transform.fromVector(pVector + m_attrs, m_attrs);
	const size_t rowCount = m_pData2->rows();
	for(size_t r = 0; r != rowCount; ++r)
	{
		double* pIn = m_pData2->row(r);
		double* pOut = m_transformed.row(r);

		// Multiply by the transform, then add the leading part of pVector
		m_transform.multiply(pIn, pOut);
		GVec::add(pOut, pVector, m_attrs);
	}
}
// virtual void GBayesianModelCombination::determineWeights(GMatrix& features, GMatrix& labels) { double* pWeights = new double[m_models.size()]; ArrayHolder<double> hWeights(pWeights); GVec::setAll(pWeights, 0.0, m_models.size()); double sumWeight = 0.0; double maxLogProb = -1e38; for(size_t i = 0; i < m_samples; i++) { // Set weights randomly from a dirichlet distribution with unifrom probabilities for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = m_rand.exponential(); normalizeWeights(); // Evaluate accuracy double d = 1.0 - (sumSquaredError(features, labels) / labels.rows()); double logProbEnsembleGivenData; if(d <= 0.0) logProbEnsembleGivenData = -1e38; else if(d == 1.0) logProbEnsembleGivenData = 0.0; else logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d)); // Update the weights if(logProbEnsembleGivenData > maxLogProb) { GVec::multiply(pWeights, exp(maxLogProb - logProbEnsembleGivenData), m_models.size()); maxLogProb = logProbEnsembleGivenData; } double w = exp(logProbEnsembleGivenData - maxLogProb); GVec::multiply(pWeights, sumWeight / (sumWeight + w), m_models.size()); double* pW = pWeights; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) *(pW++) += w * (*it)->m_weight; sumWeight += w; } double* pW = pWeights; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = *(pW++); }
// virtual void GLinearDistribution::trainInner(GMatrix& features, GMatrix& labels) { // Init A with the inverse of the weights prior covariance matrix size_t dims = features.cols(); GMatrix a(dims, dims); a.setAll(0.0); // Init XY size_t labelDims = labels.cols(); GMatrix xy(dims, labelDims); xy.setAll(0.0); // Train on each instance double w = 1.0 / (m_noiseDev * m_noiseDev); for(size_t i = 0; i < features.rows(); i++) { // Update A double* pFeat = features[i]; for(size_t j = 0; j < dims; j++) { double* pEl = a[j]; for(size_t k = 0; k < dims; k++) { *pEl += pFeat[j] * pFeat[k]; pEl++; } } // Update XY double* pLab = labels[i]; for(size_t j = 0; j < dims; j++) { double* pEl = xy[j]; for(size_t k = 0; k < labelDims; k++) { *pEl += pFeat[j] * pLab[k]; pEl++; } } } a.multiply(w); xy.multiply(w); // Compute final matrices clear(); m_pAInv = a.pseudoInverse(); GAssert(m_pAInv->cols() == dims); GAssert(m_pAInv->rows() == dims); m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true); GAssert(m_pWBar->cols() == dims); GAssert(m_pWBar->rows() == labelDims); m_pBuf = new double[dims]; }
// virtual void GPolynomial::trainInner(GMatrix& features, GMatrix& labels) { GMatrix labelCol(labels.rows(), 1); clear(); for(size_t i = 0; i < labels.cols(); i++) { GPolynomialSingleLabel* pPSL = new GPolynomialSingleLabel(m_controlPoints); m_polys.push_back(pPSL); labelCol.copyColumns(0, &labels, i, 1); pPSL->train(features, labelCol); } }
// Loads a dataset into hOutput. The next argument is the filename. The format
// is taken from an optional "-input_type <type>" flag, or else deduced from
// the filename extension (no extension means ARFF). The supported types are
// "arff", "csv", and "dat" (parsed with the CSV parser, using a '\0'
// separator). For the CSV-based formats, a per-column parsing report is
// written to cerr. Throws Ex if no filename is available or the type is not
// supported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	// Get the filename
	if(args.size() < 1)
		throw Ex("Expected the filename of a dataset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);

	// Determine the input type
	const char* input_type;
	if(args.next_is_flag() && args.if_pop("-input_type"))
		input_type = args.pop_string();
	else
	{
		// Deduce it from the extension (if any)
		input_type = szFilename + pd.extStart;
		if(*input_type != '.')
			input_type = "arff"; // no extension - assume ARFF
		else
			input_type++; // skip the dot
	}

	// Now load the data
	GMatrix data;
	const bool isCsv = (_stricmp(input_type, "csv") == 0);
	const bool isDat = (_stricmp(input_type, "dat") == 0);
	if(_stricmp(input_type, "arff") == 0)
		data.loadArff(szFilename);
	else if(isCsv || isDat)
	{
		// The two formats differ only in the separator
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}
	else
	{
		// Report the type that was actually tested. It may have come
		// from -input_type rather than from the file extension.
		throw Ex("Unsupported file format: ", input_type);
	}

	// Hand a copy of the whole matrix to the caller
	hOutput.reset(data.cloneSub(0, 0, data.rows(), data.cols()));
}
// virtual void GBayesianModelCombination::determineWeights(const GMatrix& features, const GMatrix& labels) { GQUICKVEC(weights, m_models.size()); weights.fill(0.0); double sumWeight = 0.0; double maxLogProb = -1e38; for(size_t i = 0; i < m_samples; i++) { // Set weights randomly from a dirichlet distribution with unifrom probabilities for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = m_rand.exponential(); normalizeWeights(); // Evaluate accuracy double d = 1.0 - (sumSquaredError(features, labels) / labels.rows()); double logProbEnsembleGivenData; if(d <= 0.0) logProbEnsembleGivenData = -1e38; else if(d == 1.0) logProbEnsembleGivenData = 0.0; else logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d)); // Update the weights if(logProbEnsembleGivenData > maxLogProb) { weights *= exp(maxLogProb - logProbEnsembleGivenData); maxLogProb = logProbEnsembleGivenData; } double w = exp(logProbEnsembleGivenData - maxLogProb); weights *= (sumWeight / (sumWeight + w)); size_t pos = 0; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) weights[pos++] += (w * (*it)->m_weight); sumWeight += w; } size_t pos = 0; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = weights[pos++]; }