// virtual void GLinearRegressor::trainInner(const GMatrix& features, const GMatrix& labels) { if(!features.relation().areContinuous()) throw Ex("GLinearRegressor only supports continuous features. Perhaps you should wrap it in a GAutoFilter."); if(!labels.relation().areContinuous()) throw Ex("GLinearRegressor only supports continuous labels. Perhaps you should wrap it in a GAutoFilter."); // Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon clear(); GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels); Holder<GMatrix> hAll(pAll); GPCA pca(features.cols()); pca.train(*pAll); size_t inputs = features.cols(); size_t outputs = labels.cols(); GMatrix f(inputs, inputs); GMatrix l(inputs, outputs); for(size_t i = 0; i < inputs; i++) { GVec::copy(f[i].data(), pca.basis()->row(i).data(), inputs); double sqmag = f[i].squaredMagnitude(); if(sqmag > 1e-10) f[i] *= 1.0 / sqmag; l[i].set(pca.basis()->row(i).data() + inputs, outputs); } m_pBeta = GMatrix::multiply(l, f, true, false); m_epsilon.resize(outputs); GVecWrapper vw(pca.centroid().data(), m_pBeta->cols()); m_pBeta->multiply(vw.vec(), m_epsilon, false); m_epsilon *= -1.0; GVec::add(m_epsilon.data(), pca.centroid().data() + inputs, outputs); // Refine the results using gradient descent refine(features, labels, 0.06, 20, 0.75); }
/// Loads the file szFilename into m, choosing the parser from the file extension
/// (case-insensitive): ".arff", ".csv", or ".dat" (parsed as CSV with '\0' set as the separator).
/// For the CSV-style formats a per-column parsing report is written to stderr.
/// Throws Ex for any other extension.
void loadData(GMatrix& m, const char* szFilename)
{
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;

	if(_stricmp(szExt, ".arff") == 0)
	{
		m.loadArff(szFilename);
		return;
	}

	const bool isCsv = (_stricmp(szExt, ".csv") == 0);
	const bool isDat = !isCsv && (_stricmp(szExt, ".dat") == 0);
	if(!isCsv && !isDat)
		throw Ex("Unsupported file format: ", szExt);

	// Both remaining formats go through the CSV parser; only the separator differs
	GCSVParser parser;
	if(isDat)
		parser.setSeparator('\0');
	parser.parse(m, szFilename);
	cerr << "\nParsing Report:\n";
	for(size_t col = 0; col < m.cols(); col++)
		cerr << to_str(col) << ") " << parser.report(col) << "\n";
}
// virtual void GLinearRegressor::trainInner(GMatrix& features, GMatrix& labels) { // Use a fast, but not-very-numerically-stable technique to compute an initial approximation for beta and epsilon clear(); GMatrix* pAll = GMatrix::mergeHoriz(&features, &labels); Holder<GMatrix> hAll(pAll); GPCA pca(features.cols(), &m_rand); pca.train(*pAll); size_t inputs = features.cols(); size_t outputs = labels.cols(); GMatrix f(inputs, inputs); GMatrix l(inputs, outputs); for(size_t i = 0; i < inputs; i++) { GVec::copy(f[i], pca.basis(i), inputs); double sqmag = GVec::squaredMagnitude(f[i], inputs); if(sqmag > 1e-10) GVec::multiply(f[i], 1.0 / sqmag, inputs); GVec::copy(l[i], pca.basis(i) + inputs, outputs); } m_pBeta = GMatrix::multiply(l, f, true, false); m_pEpsilon = new double[outputs]; m_pBeta->multiply(pca.mean(), m_pEpsilon, false); GVec::multiply(m_pEpsilon, -1.0, outputs); GVec::add(m_pEpsilon, pca.mean() + inputs, outputs); // Refine the results using gradient descent refine(features, labels, 0.06, 20, 0.75); }
/***********************************************************************//**
 * @brief GMatrix to GSymMatrix storage class convertor
 *
 * @param[in] matrix General matrix (GMatrix).
 *
 * @exception GException::matrix_not_symmetric
 *            Matrix is not square, or an element differs from its
 *            transposed counterpart.
 *
 * Converts a general matrix into the symmetric storage class. If the input
 * matrix is not symmetric, an exception is thrown. A non-square matrix is
 * rejected before any element is accessed, since comparing element
 * (row,col) against (col,row) is only meaningful (and only within bounds)
 * when the number of rows equals the number of columns.
 ***************************************************************************/
GSymMatrix::GSymMatrix(const GMatrix& matrix)
{
    // Initialise class members for clean destruction
    init_members();

    // A non-square matrix can never be symmetric. Reject it up front so that
    // the mirrored access matrix(col,row) below always stays within bounds.
    if (matrix.rows() != matrix.cols()) {
        throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                               matrix.rows(), matrix.cols());
    }

    // Allocate matrix memory
    alloc_members(matrix.rows(), matrix.cols());

    // Fill the lower triangle (including the diagonal), verifying each
    // element against its mirror image in the upper triangle
    for (int col = 0; col < matrix.cols(); ++col) {
        for (int row = col; row < matrix.rows(); ++row) {
            const double value_ll = matrix(row,col);
            const double value_ur = matrix(col,row);
            if (value_ll != value_ur) {
                throw GException::matrix_not_symmetric(G_CAST_MATRIX,
                                                       matrix.rows(),
                                                       matrix.cols());
            }
            (*this)(row, col) = value_ll;
        }
    }
}
/// Fits this polynomial to a single label column. The polynomial is initialized for
/// features.cols() dimensions, a momentum greedy search is run against a
/// GPolynomialRegressCritic, and the resulting vector is installed as the coefficients
/// before fromBezierCoefficients() is applied.
/// labels is expected to have exactly one column (checked with GAssert).
void GPolynomialSingleLabel::train(GMatrix& features, GMatrix& labels)
{
	GAssert(labels.cols() == 1);
	init(features.cols());

	// Search for the coefficient vector
	GPolynomialRegressCritic critic(this, features, labels);
	//GStochasticGreedySearch search(&critic);
	GMomentumGreedySearch search(&critic);
	search.searchUntil(100, 30, 0.01);

	// Adopt the vector the search ended on
	setCoefficients(search.currentVector());
	fromBezierCoefficients();
}
/// Pops a filename (and an optional "-input_type <type>" flag) from args, loads that
/// dataset, and stores a copy of it in hOutput.
///
/// The format is taken from -input_type when given; otherwise it is deduced from the
/// file extension, and a file with no extension is assumed to be ARFF. Supported
/// types (case-insensitive) are "arff", "csv", and "dat" (parsed as CSV with '\0' set
/// as the separator). For the CSV-style formats a per-column parsing report is written
/// to stderr.
///
/// Throws Ex if no filename remains in args, or if the format is not supported. The
/// error names the format that was actually requested (previously it always printed
/// the file extension, even when -input_type had overridden it).
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);

	// Determine the input type
	const char *input_type;
	if (args.next_is_flag() && args.if_pop("-input_type")) {
		input_type = args.pop_string();
	} else {
		/* deduce it from extension (if any) */
		input_type = szFilename + pd.extStart;
		if (*input_type != '.') /* no extension - assume ARFF */
			input_type = "arff";
		else
			input_type++;
	}

	// Now load the data
	GMatrix data;
	if(_stricmp(input_type, "arff") == 0) {
		data.loadArff(szFilename);
	} else {
		const bool isCsv = (_stricmp(input_type, "csv") == 0);
		const bool isDat = !isCsv && (_stricmp(input_type, "dat") == 0);
		if(!isCsv && !isDat)
			throw Ex("Unsupported file format: ", input_type);

		// Both formats go through the CSV parser; only the separator differs
		GCSVParser parser;
		if(isDat)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	}

	// Hand a heap-allocated copy of the whole matrix to the caller
	hOutput.reset(data.cloneSub(0, 0, data.rows(), data.cols()));
}
// virtual void GLinearDistribution::trainInner(GMatrix& features, GMatrix& labels) { // Init A with the inverse of the weights prior covariance matrix size_t dims = features.cols(); GMatrix a(dims, dims); a.setAll(0.0); // Init XY size_t labelDims = labels.cols(); GMatrix xy(dims, labelDims); xy.setAll(0.0); // Train on each instance double w = 1.0 / (m_noiseDev * m_noiseDev); for(size_t i = 0; i < features.rows(); i++) { // Update A double* pFeat = features[i]; for(size_t j = 0; j < dims; j++) { double* pEl = a[j]; for(size_t k = 0; k < dims; k++) { *pEl += pFeat[j] * pFeat[k]; pEl++; } } // Update XY double* pLab = labels[i]; for(size_t j = 0; j < dims; j++) { double* pEl = xy[j]; for(size_t k = 0; k < labelDims; k++) { *pEl += pFeat[j] * pLab[k]; pEl++; } } } a.multiply(w); xy.multiply(w); // Compute final matrices clear(); m_pAInv = a.pseudoInverse(); GAssert(m_pAInv->cols() == dims); GAssert(m_pAInv->rows() == dims); m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true); GAssert(m_pWBar->cols() == dims); GAssert(m_pWBar->rows() == labelDims); m_pBuf = new double[dims]; }
// virtual void GLinearDistribution::trainInner(const GMatrix& features, const GMatrix& labels) { if(!features.relation().areContinuous()) throw Ex("GLinearDistribution only supports continuous features. Perhaps you should wrap it in a GAutoFilter."); if(!labels.relation().areContinuous()) throw Ex("GLinearDistribution only supports continuous labels. Perhaps you should wrap it in a GAutoFilter."); // Init A with the inverse of the weights prior covariance matrix size_t dims = features.cols(); GMatrix a(dims, dims); a.setAll(0.0); // Init XY size_t labelDims = labels.cols(); GMatrix xy(dims, labelDims); xy.setAll(0.0); // Train on each instance double w = 1.0 / (m_noiseDev * m_noiseDev); for(size_t i = 0; i < features.rows(); i++) { // Update A const GVec& feat = features[i]; for(size_t j = 0; j < dims; j++) { GVec& el = a[j]; for(size_t k = 0; k < dims; k++) el[k] += feat[j] * feat[k]; } // Update XY const GVec& lab = labels[i]; for(size_t j = 0; j < dims; j++) { GVec& el = xy[j]; for(size_t k = 0; k < labelDims; k++) el[k] += feat[j] * lab[k]; } } a.multiply(w); xy.multiply(w); // Compute final matrices clear(); m_pAInv = a.pseudoInverse(); GAssert(m_pAInv->cols() == dims); GAssert(m_pAInv->rows() == dims); m_pWBar = GMatrix::multiply(xy, *m_pAInv, true, true); GAssert(m_pWBar->cols() == dims); GAssert(m_pWBar->rows() == labelDims); m_buf.resize(dims); }
/// Command: replaces a randomly chosen portion of the values in a dataset with
/// "unknown" markers and prints the result to stdout.
///
/// Arguments popped from args: the dataset filename, the portion of values to drop,
/// and an optional "-seed <uint>". Columns whose relation reports a value count of 0
/// receive UNKNOWN_REAL_VALUE; all other columns receive UNKNOWN_DISCRETE_VALUE.
///
/// The loaded dataset is now owned by a Holder (the original never released it).
void dropRandomValues(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);

	// n = values not yet visited, k = values still to be dropped. Each value is
	// dropped with probability k/n, so k values are dropped in total.
	size_t n = pData->rows() * pData->cols();
	size_t k = size_t(portion * n);
	for(size_t i = 0; i < pData->cols(); i++)
	{
		// Pick the unknown-marker that matches this column's type
		const bool continuous = (pData->relation()->valueCount(i) == 0);
		const double unknownValue = continuous ? (double)UNKNOWN_REAL_VALUE : (double)UNKNOWN_DISCRETE_VALUE;
		for(size_t j = 0; j < pData->rows(); j++)
		{
			if(rand.next(n) < k)
			{
				pData->row(j)[i] = unknownValue;
				k--;
			}
			n--;
		}
	}
	pData->print(cout);
}
/// Refines m_pBeta and m_pEpsilon with stochastic gradient descent.
/// Rows are visited in a freshly shuffled order in each of the "epochs" passes, and
/// learningRate is multiplied by learningRateDecayFactor after each pass.
/// For the weight update, the step size is divided by the squared magnitude of the
/// gradient whenever that magnitude exceeds 1; the bias update always uses the
/// undivided learningRate.
void GLinearRegressor::refine(GMatrix& features, GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor)
{
	size_t fDims = features.cols();
	size_t lDims = labels.cols();
	size_t* pIndexes = new size_t[features.rows()];
	ArrayHolder<size_t> hIndexes(pIndexes);
	GIndexVec::makeIndexVec(pIndexes, features.rows());
	for(size_t epoch = 0; epoch < epochs; epoch++)
	{
		GIndexVec::shuffle(pIndexes, features.rows(), &m_rand);
		for(size_t j = 0; j < features.rows(); j++)
		{
			const size_t rowIndex = pIndexes[j];
			double* pFeat = features[rowIndex];
			double* pLab = labels[rowIndex];
			for(size_t k = 0; k < lDims; k++)
			{
				// Prediction error for label dimension k
				double err = pLab[k] - (GVec::dotProduct(pFeat, m_pBeta->row(k), fDims) + m_pEpsilon[k]);

				// Squared magnitude of the gradient (weights plus bias)
				double mag = 0.0;
				for(size_t l = 0; l < fDims; l++)
				{
					double d = pFeat[l] * err;
					mag += (d * d);
				}
				mag += err * err;

				// Shrink the step when the gradient is large
				double lr = learningRate;
				if(mag > 1.0)
					lr /= mag;

				// Update the weights, then the bias
				double* pW = m_pBeta->row(k);
				for(size_t l = 0; l < fDims; l++)
					pW[l] += pFeat[l] * lr * err;
				m_pEpsilon[k] += learningRate * err;
			}
		}
		learningRate *= learningRateDecayFactor;
	}
}
/// Command: prints an auto-correlation table for the dataset named by the next argument.
/// Each output row holds a lag (1 .. min(256, rows/2)) followed by, for every column,
/// the mean over all row pairs separated by that lag of the product of their
/// deviations from the column's centroid value.
void autoCorrelation(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t lag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);
	GMatrix ac(0, dims + 1);
	for(size_t i = 1; i <= lag; i++)
	{
		double* pOut = ac.newRow();
		pOut[0] = (double)i; // first column is the lag itself
		for(size_t j = 0; j < dims; j++)
		{
			double* pCell = pOut + 1 + j;
			*pCell = 0;
			size_t pairs = 0;
			while(pairs + i < pData->rows())
			{
				double* pA = pData->row(pairs);
				double* pB = pData->row(pairs + i);
				*pCell += (pA[j] - mean[j]) * (pB[j] - mean[j]);
				pairs++;
			}
			*pCell /= pairs;
		}
	}
	ac.print(cout);
}
/// Command: adds noise of the form dev * prng.normal() to the values of a dataset and
/// prints the result to stdout.
///
/// Arguments popped from args: the dataset filename, the deviation "dev", and the
/// optional flags "-seed <uint>" and "-excludelast <uint>" (the number of trailing
/// columns to leave untouched).
///
/// Fixes relative to the previous version:
///  - "-excludelast" larger than the number of columns used to underflow the column
///    count and write far past the end of every row; it is now rejected.
///  - The error for an unrecognized flag no longer claims to be about a "neighbor finder".
void addNoise(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	unsigned int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}
	if(excludeLast > pData->cols())
		ThrowError("The value given for -excludelast exceeds the number of columns in the data");

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			pRow[c] += dev * prng.normal();
	}
	pData->print(cout);
}
void curviness2(GArgReader& args) { GMatrix* pData = loadData(args.pop_string()); Holder<GMatrix> hData(pData); GNormalize norm; GMatrix* pDataNormalized = norm.doit(*pData); Holder<GMatrix> hDataNormalized(pDataNormalized); hData.reset(); pData = NULL; // Parse Options size_t maxEigs = 10; unsigned int seed = getpid() * (unsigned int)time(NULL); Holder<GMatrix> hControlData(NULL); while(args.size() > 0) { if(args.if_pop("-seed")) seed = args.pop_uint(); else if(args.if_pop("-maxeigs")) maxEigs = args.pop_uint(); else throw Ex("Invalid option: ", args.peek()); } GRand rand(seed); size_t targetDims = std::min(maxEigs, pDataNormalized->cols()); // Do linear PCA GNeuroPCA np1(targetDims, &rand); np1.setActivation(new GActivationIdentity()); np1.computeEigVals(); GMatrix* pResults1 = np1.doit(*pDataNormalized); Holder<GMatrix> hResults1(pResults1); double* pEigVals1 = np1.eigVals(); for(size_t i = 0; i + 1 < targetDims; i++) pEigVals1[i] = sqrt(pEigVals1[i]) - sqrt(pEigVals1[i + 1]); size_t max1 = GVec::indexOfMax(pEigVals1, targetDims - 1, &rand); double v1 = (double)max1; if(max1 > 0 && max1 + 2 < targetDims) v1 += (pEigVals1[max1 - 1] - pEigVals1[max1 + 1]) / (2.0 * (pEigVals1[max1 - 1] + pEigVals1[max1 + 1] - 2.0 * pEigVals1[max1])); // Do non-linear PCA GNeuroPCA np2(targetDims, &rand); np1.setActivation(new GActivationLogistic()); np2.computeEigVals(); GMatrix* pResults2 = np2.doit(*pDataNormalized); Holder<GMatrix> hResults2(pResults2); double* pEigVals2 = np2.eigVals(); for(size_t i = 0; i + 1 < targetDims; i++) pEigVals2[i] = sqrt(pEigVals2[i]) - sqrt(pEigVals2[i + 1]); size_t max2 = GVec::indexOfMax(pEigVals2, targetDims - 1, &rand); double v2 = (double)max2; if(max2 > 0 && max2 + 2 < targetDims) v2 += (pEigVals2[max2 - 1] - pEigVals2[max2 + 1]) / (2.0 * (pEigVals2[max2 - 1] + pEigVals2[max2 + 1] - 2.0 * pEigVals2[max2])); // Compute the difference in where the eigenvalues fall cout.precision(14); cout << (v1 - v2) << "\n"; }
/// Command: blends two embeddings (A and B) of an original dataset and prints the result.
///
/// Arguments popped from args, in order: the original dataset filename, the neighbor
/// finder specification (consumed by instantiateNeighborFinder), the filenames of
/// embeddings A and B, then the optional "-seed <uint>". Throws Ex if the row counts do
/// not match the original data, if A and B differ in column count, or on an unknown option.
void blendEmbeddings(GArgReader& args)
{
	// Load the files and params
	GMatrix* pOriginal = loadData(args.pop_string());
	Holder<GMatrix> hOriginal(pOriginal);
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	GRand prng(seed);
	GNeighborFinder* pNF = instantiateNeighborFinder(pOriginal, &prng, args);
	Holder<GNeighborFinder> hNF(pNF);
	GMatrix* pEmbeddingA = loadData(args.pop_string());
	Holder<GMatrix> hEmbeddingA(pEmbeddingA);
	GMatrix* pEmbeddingB = loadData(args.pop_string());
	Holder<GMatrix> hEmbeddingB(pEmbeddingB);
	if(pEmbeddingA->rows() != pOriginal->rows() || pEmbeddingB->rows() != pOriginal->rows())
		throw Ex("mismatching number of rows");
	if(pEmbeddingA->cols() != pEmbeddingB->cols())
		throw Ex("mismatching number of cols");

	// Parse Options
	while(args.size() > 0)
	{
		if(args.if_pop("-seed"))
			prng.setSeed(args.pop_uint());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Get a neighbor table, wrapping the finder in a cache if it does not already have one
	if(!pNF->isCached())
	{
		GNeighborFinderCacheWrapper* pWrapper = new GNeighborFinderCacheWrapper(hNF.release(), true);
		hNF.reset(pWrapper);
		pNF = pWrapper;
	}
	GNeighborFinderCacheWrapper* pCachedNF = (GNeighborFinderCacheWrapper*)pNF;
	pCachedNF->fillCache();
	size_t* pNeighborTable = pCachedNF->cache();

	// Do the blending, giving every row a ratio of 0.5
	const size_t rowCount = pEmbeddingA->rows();
	size_t startPoint = (size_t)prng.next(rowCount);
	double* pRatios = new double[rowCount];
	ArrayHolder<double> hRatios(pRatios);
	GVec::setAll(pRatios, 0.5, rowCount);
	GMatrix* pBlended = GManifold::blendEmbeddings(pEmbeddingA, pRatios, pEmbeddingB, pNF->neighborCount(), pNeighborTable, startPoint);
	Holder<GMatrix> hBlended(pBlended);
	pBlended->print(cout);
}
void test_transform_mergevert() { // Make some input files TempFileMaker tempFile1("a.arff", "@RELATION test\n" "@ATTRIBUTE a1 continuous\n" "@ATTRIBUTE a2 { alice, bob }\n" "@ATTRIBUTE a3 { true, false }\n" "@DATA\n" "1.2, alice, true\n" "2.3, bob, false\n" ); TempFileMaker tempFile2("b.arff", "@RELATION test\n" "@ATTRIBUTE a1 continuous\n" "@ATTRIBUTE a2 { charlie, bob }\n" "@ATTRIBUTE a3 { false, true }\n" "@DATA\n" "3.4, bob, true\n" "4.5, charlie, false\n" ); // Execute the command GPipe pipeStdOut; if(sysExec("waffles_transform", "mergevert a.arff b.arff", &pipeStdOut) != 0) throw Ex("exit status indicates failure"); char buf[512]; size_t len = pipeStdOut.read(buf, 512); if(len == 512) throw Ex("need a bigger buffer"); buf[len] = '\0'; // Check the results GMatrix M; M.parseArff(buf, strlen(buf)); if(M.rows() != 4 || M.cols() != 3) throw Ex("failed"); if(M.relation().valueCount(0) != 0) throw Ex("failed"); if(M.relation().valueCount(1) != 3) throw Ex("failed"); if(M.relation().valueCount(2) != 2) throw Ex("failed"); std::ostringstream oss; const GArffRelation* pRel = (const GArffRelation*)&M.relation(); pRel->printAttrValue(oss, 1, 2.0); string s = oss.str(); if(strcmp(s.c_str(), "charlie") != 0) throw Ex("failed"); if(M[0][0] != 1.2 || M[1][0] != 2.3 || M[2][0] != 3.4 || M[3][0] != 4.5) throw Ex("failed"); if(M[0][1] != 0 || M[1][1] != 1 || M[2][1] != 1 || M[3][1] != 2) throw Ex("failed"); if(M[0][2] != 0 || M[1][2] != 1 || M[2][2] != 0 || M[3][2] != 1) throw Ex("failed"); }
/// Command: rotates two continuous columns of a dataset about the origin and prints
/// the result. Arguments popped from args, in order: the dataset filename, the first
/// column index, the second column index, and the angle in degrees.
/// Raises an error (via ThrowError) if either index is out of range or refers to a
/// column that is not continuous.
void rotate(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	sp_relation relation = pA->relation();

	// First column
	unsigned colx = args.pop_uint();
	if(colx >= pA->cols())
	{
		ThrowError("Rotation first column index (",to_str(colx),") "
			"should not be greater "
			"than the largest index, which is ", to_str(pA->cols()-1), ".");
	}
	if(!relation->areContinuous(colx,1))
	{
		ThrowError("Rotation first column index (",to_str(colx),") "
			"should be continuous and it is not.");
	}

	// Second column
	unsigned coly = args.pop_uint();
	if(coly >= pA->cols())
	{
		ThrowError("Rotation second column index (",to_str(coly),") "
			"should not be greater "
			"than the largest index, which is ", to_str(pA->cols()-1), ".");
	}
	if(!relation->areContinuous(coly,1))
	{
		ThrowError("Rotation second column index (",to_str(coly),") "
			"should be continuous and it is not.");
	}

	// Angle: supplied in degrees, converted to radians
	double angle = args.pop_double();
	angle = angle * M_PI / 180;
	const double cosAngle = std::cos(angle);
	const double sinAngle = std::sin(angle);

	// Apply the 2-D rotation to the (colx, coly) pair of every row
	const std::size_t rowCount = pA->rows();
	for(std::size_t rowIdx = 0; rowIdx < rowCount; ++rowIdx)
	{
		double* row = (*pA)[rowIdx];
		const double x = row[colx];
		const double y = row[coly];
		row[colx] = x * cosAngle - y * sinAngle;
		row[coly] = x * sinAngle + y * cosAngle;
	}
	pA->print(cout);
}
// virtual void GBayesianModelCombination::determineWeights(GMatrix& features, GMatrix& labels) { double* pWeights = new double[m_models.size()]; ArrayHolder<double> hWeights(pWeights); GVec::setAll(pWeights, 0.0, m_models.size()); double sumWeight = 0.0; double maxLogProb = -1e38; GTEMPBUF(double, results, labels.cols()); for(size_t i = 0; i < m_samples; i++) { // Set weights randomly from a dirichlet distribution with unifrom probabilities for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = m_rand.exponential(); normalizeWeights(); // Evaluate accuracy accuracy(features, labels, results); double d = GVec::sumElements(results, labels.cols()) / labels.cols(); double logProbEnsembleGivenData; if(d == 0.0) logProbEnsembleGivenData = -1e38; else if(d == 1.0) logProbEnsembleGivenData = 0.0; else logProbEnsembleGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d)); // Update the weights if(logProbEnsembleGivenData > maxLogProb) { GVec::multiply(pWeights, exp(maxLogProb - logProbEnsembleGivenData), m_models.size()); maxLogProb = logProbEnsembleGivenData; } double w = exp(logProbEnsembleGivenData - maxLogProb); GVec::multiply(pWeights, sumWeight / (sumWeight + w), m_models.size()); double* pW = pWeights; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) *(pW++) += w * (*it)->m_weight; sumWeight += w; } double* pW = pWeights; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) (*it)->m_weight = *(pW++); }
void test_recommend_fillmissingvalues() { // Make some input files TempFileMaker tempFile1("a.arff", "@RELATION test\n" "@ATTRIBUTE a1 { a, b, c }\n" "@ATTRIBUTE a2 continuous\n" "@ATTRIBUTE a3 { d, e, f }\n" "@ATTRIBUTE a4 { g, h, i }\n" "@DATA\n" "a, ?, f, i\n" "?, 2, ?, i\n" "b, ?, d, ?\n" "?, 4, ?, ?\n" "?, ?, e, g\n" "?, ?, e, ?\n" "a, ?, ?, h\n" "\n" ); // Execute the command GPipe pipeStdOut; if(sysExec("waffles_recommend", "fillmissingvalues a.arff baseline", &pipeStdOut) != 0) throw Ex("exit status indicates failure"); char buf[512]; size_t len = pipeStdOut.read(buf, 512); if(len == 512) throw Ex("need a bigger buffer"); buf[len] = '\0'; // Check the results GMatrix M; M.parseArff(buf, strlen(buf)); if(M.rows() != 7 || M.cols() != 4) throw Ex("failed"); if(M[0][0] != 0) throw Ex("failed"); if(M[0][1] != 3) throw Ex("failed"); if(M[1][1] != 2) throw Ex("failed"); if(M[2][1] != 3) throw Ex("failed"); if(M[3][3] != 2) throw Ex("failed"); if(M[4][0] != 0) throw Ex("failed"); if(M[5][1] != 3) throw Ex("failed"); if(M[6][2] != 1) throw Ex("failed"); if(M[6][3] != 1) throw Ex("failed"); }
// virtual void GPolynomial::trainInner(GMatrix& features, GMatrix& labels) { GMatrix labelCol(labels.rows(), 1); clear(); for(size_t i = 0; i < labels.cols(); i++) { GPolynomialSingleLabel* pPSL = new GPolynomialSingleLabel(m_controlPoints); m_polys.push_back(pPSL); labelCol.copyColumns(0, &labels, i, 1); pPSL->train(features, labelCol); } }
/// Command: builds a transition dataset from an actions dataset and a state dataset
/// with the same number of rows, and prints it to stdout. Output row i consists of
/// the action in row i, the state in row i, and the state in row i + 1 (or, with
/// "-delta", the difference between state i + 1 and state i).
///
/// Fixes relative to the previous version:
///  - The output matrix is now owned by a Holder (it was never released).
///  - Empty input no longer underflows "rows() - 1"; it yields an empty output.
void transition(GArgReader& args)
{
	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Make the output data
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);
	GMatrix* pTransition = new GMatrix(pRel);
	Holder<GMatrix> hTransition(pTransition);

	// There is one transition for every pair of consecutive rows
	const size_t transitions = (pActions->rows() > 0 ? pActions->rows() - 1 : 0);
	pTransition->newRows(transitions);
	for(size_t i = 0; i < transitions; i++)
	{
		double* pOut = pTransition->row(i);
		double* pNextState = pOut + actionDims + stateDims;
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pState->row(i), stateDims);
		GVec::copy(pNextState, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pNextState, pState->row(i), stateDims);
	}
	pTransition->print(cout);
}
/// Command: deletes the listed columns from the dataset named by the next argument
/// and prints the result. Columns are deleted from the highest index to the lowest so
/// that earlier deletions do not shift the indexes still to be deleted.
void dropColumns(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	vector<size_t> doomed;
	size_t attrCount = pData->cols();
	parseAttributeList(doomed, args, attrCount);

	// Sort into descending order
	std::sort(doomed.rbegin(), doomed.rend());
	for(vector<size_t>::const_iterator it = doomed.begin(); it != doomed.end(); ++it)
		pData->deleteColumn(*it);
	pData->print(cout);
}
void GLinearRegressor::refine(const GMatrix& features, const GMatrix& labels, double learningRate, size_t epochs, double learningRateDecayFactor) { size_t fDims = features.cols(); size_t lDims = labels.cols(); size_t* pIndexes = new size_t[features.rows()]; ArrayHolder<size_t> hIndexes(pIndexes); GIndexVec::makeIndexVec(pIndexes, features.rows()); for(size_t i = 0; i < epochs; i++) { GIndexVec::shuffle(pIndexes, features.rows(), &m_rand); size_t* pIndex = pIndexes; for(size_t j = 0; j < features.rows(); j++) { const GVec& feat = features[*pIndex]; const GVec& lab = labels[*pIndex]; for(size_t k = 0; k < lDims; k++) { double err = lab[k] - (feat.dotProduct(m_pBeta->row(k)) + m_epsilon[k]); double lr = learningRate; double mag = 0.0; for(size_t l = 0; l < fDims; l++) { double d = feat[l] * err; mag += (d * d); } mag += err * err; if(mag > 1.0) lr /= mag; GVec& w = m_pBeta->row(k); for(size_t l = 0; l < fDims; l++) w[l] += feat[l] * lr * err; m_epsilon[k] += learningRate * err; } pIndex++; } learningRate *= learningRateDecayFactor; } }
///TODO: this command should be documented void center(GArgReader& args) { GMatrix* pData = loadData(args.pop_string()); Holder<GMatrix> hData(pData); unsigned int r = args.pop_uint(); size_t cols = pData->cols(); double* pRow = pData->row(r); for(size_t i = 0; i < r; ++i) GVec::subtract(pData->row(i), pRow, cols); for(size_t i = r + 1; i < pData->rows(); ++i) GVec::subtract(pData->row(i), pRow, cols); GVec::setAll(pRow, 0.0, cols); pData->print(cout); }
/// Command: adds a constant offset to the listed columns of a dataset and prints the
/// result. Arguments popped from args, in order: the dataset filename, the column
/// list (consumed by parseAttributeList), and the offset.
void shiftColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double offset = args.pop_double();

	const size_t selected = cols.size();
	for(size_t r = 0; r < pA->rows(); r++)
	{
		double* pRow = pA->row(r);
		for(size_t c = 0; c < selected; c++)
			pRow[cols[c]] += offset;
	}
	pA->print(cout);
}
// virtual void GBayesianModelAveraging::determineWeights(GMatrix& features, GMatrix& labels) { GTEMPBUF(double, results, labels.cols()); double m = -1e38; for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) { (*it)->m_pModel->accuracy(features, labels, results); double d = GVec::sumElements(results, labels.cols()) / labels.cols(); double logProbHypothGivenData; if(d == 0.0) logProbHypothGivenData = -1e38; else if(d == 1.0) logProbHypothGivenData = 0.0; else logProbHypothGivenData = features.rows() * (d * log(d) + (1.0 - d) * log(1.0 - d)); m = std::max(m, logProbHypothGivenData); (*it)->m_weight = logProbHypothGivenData; } for(vector<GWeightedModel*>::iterator it = m_models.begin(); it != m_models.end(); it++) { double logProbHypothGivenData = (*it)->m_weight; (*it)->m_weight = exp(logProbHypothGivenData - m); } }
/// Command: replaces the listed columns of a dataset with their running (cumulative)
/// sums down the rows, and prints the result. Arguments popped from args: the dataset
/// filename and the column list (consumed by parseAttributeList).
///
/// Fix: the previous row is now looked up inside the loop, so row 0 is no longer
/// requested from a dataset that has no rows.
void cumulativeColumns(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	for(size_t i = 1; i < pA->rows(); i++)
	{
		// Row i - 1 already holds its cumulative totals
		double* pPrevRow = pA->row(i - 1);
		double* pRow = pA->row(i);
		for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
			pRow[*it] += pPrevRow[*it];
	}
	pA->print(cout);
}
void GGaussianProcess::trainInnerInner(const GMatrix& features, const GMatrix& labels) { clear(); GMatrix* pL; { // Compute the kernel matrix GMatrix k(features.rows(), features.rows()); for(size_t i = 0; i < features.rows(); i++) { GVec& row = k[i]; const GVec& a = features[i]; for(size_t j = 0; j < features.rows(); j++) { const GVec& b = features[j]; row[j] = m_weightsPriorVar * m_pKernel->apply(a, b); } } // Add the noise variance to the diagonal of the kernel matrix for(size_t i = 0; i < features.rows(); i++) k[i][i] += m_noiseVar; // Compute L pL = k.cholesky(true); } std::unique_ptr<GMatrix> hL(pL); // Compute the model m_pLInv = pL->pseudoInverse(); GMatrix* pTmp = GMatrix::multiply(*m_pLInv, labels, false, false); std::unique_ptr<GMatrix> hTmp(pTmp); GMatrix* pLTrans = pL->transpose(); std::unique_ptr<GMatrix> hLTrans(pLTrans); GMatrix* pLTransInv = pLTrans->pseudoInverse(); std::unique_ptr<GMatrix> hLTransInv(pLTransInv); m_pAlpha = GMatrix::multiply(*pLTransInv, *pTmp, false, false); GAssert(m_pAlpha->rows() == features.rows()); GAssert(m_pAlpha->cols() == labels.cols()); m_pStoredFeatures = new GMatrix(); m_pStoredFeatures->copy(&features); }
/// Loads a sparse matrix from szFilename, choosing the format from the file extension
/// (case-insensitive):
///  - ".sparse": a JSON document passed to the GSparseMatrix constructor.
///  - ".arff": a dense 3-column file whose columns are 0) user or row-index,
///    1) item or col-index, 2) value or rating. Each row sets one element.
/// The caller owns the returned matrix. Throws Ex for any other extension, for an ARFF
/// file without exactly 3 columns, or when the index columns fail the range checks.
GSparseMatrix* GRecommenderLib::loadSparseData(const char* szFilename)
{
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;

	if(_stricmp(szExt, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	if(_stricmp(szExt, ".arff") != 0)
		throw Ex("Unsupported file format: ", szExt);

	// Convert a 3-column dense ARFF file to a sparse matrix
	GMatrix data;
	data.loadArff(szFilename);
	if(data.cols() != 3)
		throw Ex("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");

	// Minimum and range of each index column
	double m0 = data.columnMin(0);
	double r0 = data.columnMax(0) - m0;
	double m1 = data.columnMin(1);
	double r1 = data.columnMax(1) - m1;
	if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
		throw Ex("Invalid row indexes");
	if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
		throw Ex("Invalid col indexes");

	// Size the matrix to hold the largest index in each dimension, then fill it
	std::unique_ptr<GSparseMatrix> hMatrix(new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE));
	for(size_t i = 0; i < data.rows(); i++)
	{
		GVec& row = data.row(i);
		hMatrix->set(size_t(row[0]), size_t(row[1]), row[2]);
	}
	return hMatrix.release();
}
/// Loads a sparse matrix from szFilename, choosing the format from the file extension
/// (case-insensitive):
///  - ".arff": a dense 3-column file whose columns are 0) user or row-index,
///    1) item or col-index, 2) value or rating. Each row sets one element.
///  - ".sparse": a JSON document passed to the GSparseMatrix constructor.
/// The caller owns the returned matrix. Raises an error (via ThrowError) for any other
/// extension, for an ARFF file without exactly 3 columns, or when the index columns
/// fail the range checks.
///
/// Fix: the intermediate dense matrix is now owned by a Holder; previously it was
/// never released, on either the success or the error paths.
GSparseMatrix* loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix* pData = GMatrix::loadArff(szFilename);
		Holder<GMatrix> hData(pData);
		if(pData->cols() != 3)
			ThrowError("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");

		// Minimum and range of each index column
		double m0, r0, m1, r1;
		pData->minAndRange(0, &m0, &r0);
		pData->minAndRange(1, &m1, &r1);
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			ThrowError("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			ThrowError("Invalid col indexes");

		// Size the matrix to hold the largest index in each dimension, then fill it
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		Holder<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			pMatrix->set(size_t(pRow[0]), size_t(pRow[1]), pRow[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	ThrowError("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
// virtual void GResamplingAdaBoost::trainInnerInner(const GMatrix& features, const GMatrix& labels) { clear(); // Initialize all instances with uniform weights GVec pDistribution(features.rows()); pDistribution.fill(1.0 / features.rows()); size_t drawRows = size_t(m_trainSize * features.rows()); size_t* pDrawnIndexes = new size_t[drawRows]; std::unique_ptr<size_t[]> hDrawnIndexes(pDrawnIndexes); // Train the ensemble size_t labelDims = labels.cols(); double penalty = 1.0 / labelDims; GVec prediction(labelDims); for(size_t es = 0; es < m_ensembleSize; es++) { // Draw a training set from the distribution GCategoricalSamplerBatch csb(features.rows(), pDistribution, m_rand); csb.draw(drawRows, pDrawnIndexes); GMatrix drawnFeatures(features.relation().clone()); GReleaseDataHolder hDrawnFeatures(&drawnFeatures); GMatrix drawnLabels(labels.relation().clone()); GReleaseDataHolder hDrawnLabels(&drawnLabels); size_t* pIndex = pDrawnIndexes; for(size_t i = 0; i < drawRows; i++) { drawnFeatures.takeRow((GVec*)&features[*pIndex]); drawnLabels.takeRow((GVec*)&labels[*pIndex]); pIndex++; } // Train an instance of the model and store a clone of it m_pLearner->train(drawnFeatures, drawnLabels); GDom doc; GSupervisedLearner* pClone = m_pLoader->loadLearner(m_pLearner->serialize(&doc)); // Compute model weight double err = 0.5; for(size_t i = 0; i < features.rows(); i++) { pClone->predict(features[i], prediction); const GVec& target = labels[i]; for(size_t j = 0; j < labelDims; j++) { if((int)target[j] != (int)prediction[j]) err += penalty; } } err /= features.rows(); if(err >= 0.5) { delete(pClone); break; } double weight = 0.5 * log((1.0 - err) / err); m_models.push_back(new GWeightedModel(weight, pClone)); // Update the distribution to favor mis-classified instances for(size_t i = 0; i < features.rows(); i++) { err = 0.0; pClone->predict(features[i], prediction); const GVec& target = labels[i]; for(size_t j = 0; j < labelDims; j++) { if((int)target[j] != (int)prediction[j]) err 
+= penalty; } err /= labelDims; pDistribution[i] *= exp(weight * (err * 2.0 - 1.0)); } pDistribution.sumToOne(); } normalizeWeights(); }