void autoCorrelation(GArgReader& args)
{
	// Estimate the autocorrelation of each column of the dataset for lags
	// 1..min(256, rows/2). Prints a matrix whose first column is the lag and
	// whose remaining columns hold the mean-removed autocorrelation (average
	// product of deviations) of each attribute at that lag.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t lag = std::min((size_t)256, pData->rows() / 2);
	size_t dims = pData->cols();
	GTEMPBUF(double, mean, dims);
	pData->centroid(mean);
	GMatrix ac(0, dims + 1);
	for(size_t i = 1; i <= lag; i++)
	{
		double* pRow = ac.newRow();
		pRow[0] = (double)i;
		for(size_t j = 0; j < dims; j++)
		{
			double sum = 0.0;
			size_t count = 0;
			while(count + i < pData->rows())
			{
				double* pA = pData->row(count);
				double* pB = pData->row(count + i);
				sum += (pA[j] - mean[j]) * (pB[j] - mean[j]);
				count++;
			}
			pRow[1 + j] = sum / count;
		}
	}
	ac.print(cout);
}
void TransformData(const double* pVector) { m_transform.fromVector(pVector + m_attrs, m_attrs); for(size_t i = 0; i < m_pData2->rows(); i++) { double* pPatIn = m_pData2->row(i); double* pPatOut = m_transformed.row(i); m_transform.multiply(pPatIn, pPatOut); GVec::add(pPatOut, pVector, m_attrs); } }
///TODO: this command should be documented void center(GArgReader& args) { GMatrix* pData = loadData(args.pop_string()); Holder<GMatrix> hData(pData); unsigned int r = args.pop_uint(); size_t cols = pData->cols(); double* pRow = pData->row(r); for(size_t i = 0; i < r; ++i) GVec::subtract(pData->row(i), pRow, cols); for(size_t i = r + 1; i < pData->rows(); ++i) GVec::subtract(pData->row(i), pRow, cols); GVec::setAll(pRow, 0.0, cols); pData->print(cout); }
void dropRandomValues(GArgReader& args)
{
	// Replaces a random portion of the values in the dataset with "unknown"
	// markers and prints the result. The sampling is exact: of the n = rows*cols
	// cells, exactly floor(portion*n) are dropped, chosen uniformly without
	// replacement via a sequential (k remaining out of n remaining) scan.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData); // fix: pData was never freed (memory leak)
	double portion = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	GRand rand(seed);
	size_t n = pData->rows() * pData->cols(); // cells not yet considered
	size_t k = size_t(portion * n);           // cells still to be dropped
	for(size_t i = 0; i < pData->cols(); i++)
	{
		size_t vals = pData->relation()->valueCount(i);
		if(vals == 0)
		{
			// Continuous attribute
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_REAL_VALUE;
					k--;
				}
				n--;
			}
		}
		else
		{
			// Nominal attribute
			for(size_t j = 0; j < pData->rows(); j++)
			{
				if(rand.next(n) < k)
				{
					pData->row(j)[i] = UNKNOWN_DISCRETE_VALUE;
					k--;
				}
				n--;
			}
		}
	}
	pData->print(cout);
}
void cumulativeColumns(GArgReader& args)
{
	// Replaces the specified columns with their running (cumulative) sums,
	// then prints the result. Other columns are left untouched.
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	if(pA->rows() > 0) // fix: row(0) on an empty matrix is invalid
	{
		double* pPrevRow = pA->row(0);
		for(size_t i = 1; i < pA->rows(); i++)
		{
			double* pRow = pA->row(i);
			for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
				pRow[*it] += pPrevRow[*it]; // add the running total accumulated in the previous row
			pPrevRow = pRow;
		}
	}
	pA->print(cout);
}
void addNoise(GArgReader& args)
{
	// Adds Gaussian noise with the given standard deviation to every value in
	// the dataset, optionally leaving the last -excludelast columns untouched
	// (e.g. to preserve label columns), then prints the result.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	double dev = args.pop_double();

	// Parse the options
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	unsigned int excludeLast = 0;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-excludelast"))
			excludeLast = args.pop_uint();
		else
			ThrowError("Invalid neighbor finder option: ", args.peek());
	}

	// fix: previously cols() - excludeLast could wrap around (size_t underflow)
	// and cause out-of-bounds writes when -excludelast exceeded the column count
	if((size_t)excludeLast > pData->cols())
		ThrowError("-excludelast is larger than the number of columns");

	GRand prng(seed);
	size_t cols = pData->cols() - excludeLast;
	for(size_t r = 0; r < pData->rows(); r++)
	{
		double* pRow = pData->row(r);
		for(size_t c = 0; c < cols; c++)
			*(pRow++) += dev * prng.normal();
	}
	pData->print(cout);
}
void enumerateValues(GArgReader& args)
{
	// Re-encodes the values in one column as consecutive integers 0, 1, 2, ...
	// in order of first appearance. If the column is nominal, it is simply
	// relabeled as continuous (its stored values are kept as-is). Prints the result.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t col = args.pop_uint();
	if(pData->relation()->valueCount(col) > 0)
		((GArffRelation*)pData->relation().get())->setAttrValueCount(col, 0);
	else
	{
		map<double,size_t> indexes; // distinct value -> enumeration index
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			map<double,size_t>::iterator it = indexes.find(pRow[col]);
			if(it != indexes.end())
				pRow[col] = (double)it->second;
			else
			{
				size_t index = indexes.size(); // next unused index
				indexes[pRow[col]] = index;
				pRow[col] = (double)index;
			}
		}
	}
	pData->print(cout);
}
void DropMissingValues(GArgReader& args)
{
	// Removes every row that contains at least one unknown value, then prints
	// the surviving rows.
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	GRelation* pRelation = pData->relation().get();
	size_t dims = pRelation->size();
	// Iterate backwards so deleting a row does not disturb rows not yet visited.
	// (The loop condition relies on size_t wrap-around to stop after row 0.)
	for(size_t i = pData->rows() - 1; i < pData->rows(); i--)
	{
		double* pPat = pData->row(i);
		bool hasUnknown = false;
		for(size_t j = 0; j < dims && !hasUnknown; j++)
		{
			if(pRelation->valueCount(j) == 0)
				hasUnknown = (pPat[j] == UNKNOWN_REAL_VALUE);     // continuous
			else
				hasUnknown = (pPat[j] == UNKNOWN_DISCRETE_VALUE); // nominal
		}
		if(hasUnknown)
			pData->deleteRow(i);
	}
	pData->print(cout);
}
GMatrix::GMatrix(GMatrix& m)
{
	// Copy constructor: duplicates the dimensions, then deep-copies the flat
	// data buffer element by element (via the other matrix's operator[]).
	m_row = m.row();
	m_col = m.col();
	const int total = m_row * m_col;
	m_data = new double[total];
	for(int idx = 0; idx < total; ++idx)
		m_data[idx] = m[idx];
}
void transition(GArgReader& args)
{
	// Given parallel datasets of actions and states (one row per time step),
	// builds and prints a transition table: each output row is
	// [action(t), state(t), state(t+1)]. With -delta, the last part holds
	// state(t+1) - state(t) instead of state(t+1).

	// Load the input data
	GMatrix* pActions = loadData(args.pop_string());
	Holder<GMatrix> hActions(pActions);
	GMatrix* pState = loadData(args.pop_string());
	Holder<GMatrix> hState(pState);
	if(pState->rows() != pActions->rows())
		ThrowError("Expected the same number of rows in both datasets");

	// Parse options
	bool delta = false;
	while(args.size() > 0)
	{
		if(args.if_pop("-delta"))
			delta = true;
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Build the output schema: the action attributes followed by two copies of
	// the (continuous) state attributes
	size_t actionDims = pActions->cols();
	size_t stateDims = pState->cols();
	GMixedRelation* pRelation = new GMixedRelation();
	sp_relation pRel = pRelation;
	pRelation->addAttrs(pActions->relation().get());
	pRelation->addAttrs(stateDims + stateDims, 0);

	// Fill one row per consecutive pair of time steps
	GMatrix* pTransition = new GMatrix(pRel);
	pTransition->newRows(pActions->rows() - 1);
	for(size_t i = 0; i + 1 < pActions->rows(); i++)
	{
		double* pOut = pTransition->row(i);
		double* pCur = pState->row(i);
		GVec::copy(pOut, pActions->row(i), actionDims);
		GVec::copy(pOut + actionDims, pCur, stateDims);
		GVec::copy(pOut + actionDims + stateDims, pState->row(i + 1), stateDims);
		if(delta)
			GVec::subtract(pOut + actionDims + stateDims, pCur, stateDims);
	}
	pTransition->print(cout);
}
void splitFold(GArgReader& args)
{
	// Splits a dataset into train/test parts for cross-validation fold `fold`
	// of `folds`: rows [begin,end) go to the test set, everything else to the
	// train set. Writes the two parts as ARFF files.

	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	size_t fold = args.pop_uint();
	size_t folds = args.pop_uint();
	if(fold >= folds)
		ThrowError("fold index out of range. It must be less than the total number of folds.");

	// Options
	string filenameTrain = "train.arff";
	string filenameTest = "test.arff";
	while(args.size() > 0)
	{
		if(args.if_pop("-out"))
		{
			filenameTrain = args.pop_string();
			filenameTest = args.pop_string();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Copy relevant portions of the data
	GMatrix train(pData->relation());
	GMatrix test(pData->relation());
	size_t begin = pData->rows() * fold / folds;
	size_t end = pData->rows() * (fold + 1) / folds;
	for(size_t i = 0; i < pData->rows(); i++)
	{
		if(i >= begin && i < end)
			test.copyRow(pData->row(i));
		else
			train.copyRow(pData->row(i));
	}
	train.saveArff(filenameTrain.c_str());
	test.saveArff(filenameTest.c_str());
}
void shiftColumns(GArgReader& args)
{
	// Adds a constant offset to every value in the specified columns, then
	// prints the result.
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	vector<size_t> cols;
	parseAttributeList(cols, args, pA->cols());
	double offset = args.pop_double();
	// Walk column-by-column (order of the additions does not matter)
	for(vector<size_t>::iterator it = cols.begin(); it != cols.end(); it++)
	{
		size_t col = *it;
		for(size_t i = 0; i < pA->rows(); i++)
			pA->row(i)[col] += offset;
	}
	pA->print(cout);
}
void Discretize(GArgReader& args)
{
	// Converts continuous attributes into nominal attributes by bucketizing
	// each value into one of nBuckets equal-width bins, then prints the result.

	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse Options
	size_t nFirst = 0;
	size_t nLast = pData->relation()->size() - 1;
	// Default bucket count: about sqrt(rows), but at least 2
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	// fix: removed tautological "nFirst < 0" test (nFirst is unsigned)
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	for(size_t i = nFirst; i <= nLast; i++)
	{
		if(pData->relation()->valueCount(i) != 0)
			continue; // already nominal
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);
			if(range > 0)
				pPat[i] = (double)std::max((size_t)0, std::min(nBuckets - 1, (size_t)floor(((pPat[i] - min) * nBuckets) / range)));
			else
				pPat[i] = 0.0; // fix: constant column previously divided by zero (NaN -> UB cast)
		}
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
void Export(GArgReader& args)
{
	// Prints the dataset as delimited text: comma-separated by default,
	// tab-separated with -tab, or space-separated with -space.

	// Load
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);

	// Parse options
	const char* separator = ",";
	while(args.size() > 0)
	{
		if(args.if_pop("-tab"))
			separator = "\t"; // fix: previously emitted a space, making -tab identical to -space
		else if(args.if_pop("-space"))
			separator = " ";
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print
	for(size_t i = 0; i < pData->rows(); i++)
		pData->relation()->printRow(cout, pData->row(i), separator);
}
GSparseMatrix* GRecommenderLib::loadSparseData(const char* szFilename)
{
	// Loads a sparse matrix, dispatching on the file extension:
	//   .arff   -> a 3-column dense file of (row-index, col-index, value) triples
	//   .sparse -> a JSON-serialized GSparseMatrix
	// The caller takes ownership of the returned matrix.
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;
	if(_stricmp(szExt, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix data;
		data.loadArff(szFilename);
		if(data.cols() != 3)
			throw Ex("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");

		// Sanity-check the index ranges before sizing the matrix
		double m0 = data.columnMin(0);
		double r0 = data.columnMax(0) - m0;
		double m1 = data.columnMin(1);
		double r1 = data.columnMax(1) - m1;
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			throw Ex("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			throw Ex("Invalid col indexes");

		// Populate the sparse matrix from the triples
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		std::unique_ptr<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < data.rows(); i++)
		{
			GVec& row = data.row(i);
			pMatrix->set(size_t(row[0]), size_t(row[1]), row[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szExt, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	throw Ex("Unsupported file format: ", szExt);
	return NULL;
}
GSparseMatrix* loadSparseData(const char* szFilename)
{
	// Loads a sparse matrix from either a 3-column .arff file of
	// (row-index, col-index, value) triples, or a JSON-serialized .sparse file.
	// The caller takes ownership of the returned matrix.
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix* pData = GMatrix::loadArff(szFilename);
		Holder<GMatrix> hData(pData); // fix: pData was leaked on every path
		if(pData->cols() != 3)
			ThrowError("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");
		// Sanity-check the index ranges before sizing the matrix
		double m0, r0, m1, r1;
		pData->minAndRange(0, &m0, &r0);
		pData->minAndRange(1, &m1, &r1);
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			ThrowError("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			ThrowError("Invalid col indexes");
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		Holder<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			pMatrix->set(size_t(pRow[0]), size_t(pRow[1]), pRow[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	ThrowError("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}
// Trains a collaborative filter to impute (fill in) the missing values of a
// dataset, then prints the completed dataset to stdout.
void fillMissingValues(GArgReader& args)
{
	// Seed the random number generator (overridable with -seed)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix* pDataOrig = GMatrix::loadArff(args.pop_string());
	Holder<GMatrix> hDataOrig(pDataOrig);
	sp_relation pOrigRel = pDataOrig->relation(); // keep the original schema so it can be restored at the end
	GRand prng(seed);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(prng, args);
	Holder<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Convert to all normalized real values (nominals become categorical reals;
	// unknown values are preserved so they can be imputed below)
	GNominalToCat* pNtc = new GNominalToCat();
	GTwoWayTransformChainer filter(new GNormalize(), pNtc);
	pNtc->preserveUnknowns();
	filter.train(*pDataOrig);
	GMatrix* pData = filter.transformBatch(*pDataOrig);
	Holder<GMatrix> hData(pData);
	// NOTE(review): Holder::release relinquishes ownership without deleting, so
	// the original matrix appears to be leaked here -- confirm whether reset()
	// was intended.
	hDataOrig.release();
	pDataOrig = NULL;

	// Convert to 3-column (row-index, col-index, value) form, listing only the
	// known values
	GMatrix* pMatrix = new GMatrix(0, 3);
	Holder<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow != UNKNOWN_REAL_VALUE)
			{
				double* pVec = pMatrix->newRow();
				pVec[0] = i;
				pVec[1] = j;
				pVec[2] = *pRow;
			}
			pRow++;
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	// NOTE(review): same release-without-delete pattern as above -- pMatrix
	// appears to be leaked; confirm.
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		double* pRow = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(*pRow == UNKNOWN_REAL_VALUE)
				*pRow = pModel->predict(i, j);
			GAssert(*pRow != UNKNOWN_REAL_VALUE);
			pRow++;
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = filter.untransformBatch(*pData);
	pOut->setRelation(pOrigRel);
	pOut->print(cout);
}
void Train(GArgReader &args)
{
	// Trains a GNeuralDecomposition model on a time series and writes the
	// serialized model as JSON to stdout. A one-column input is treated as the
	// label series (features are generated as equally-spaced points on [0,1));
	// a two-column input is split into (feature, label) columns.

	// Load series from file
	std::unique_ptr<GMatrix> hSeries, hFeatures;
	LoadData(args, hSeries);
	GMatrix *pSeries = hSeries.get();

	// Split features/labels
	if(pSeries->cols() == 2)
	{
		GMatrix *pFeatures = pSeries->cloneSub(0, 0, pSeries->rows(), 1);
		GMatrix *pLabels = pSeries->cloneSub(0, 1, pSeries->rows(), 1);
		hFeatures.reset(pFeatures);
		hSeries.reset(pLabels);
		pSeries = pLabels;
	}
	else if(pSeries->cols() > 2)
	{
		throw Ex("Too many columns!");
	}

	// Parse options
	// fix: the model was allocated with a raw new and never deleted -- it leaked
	// on every run (and on every throw below). Own it with a unique_ptr.
	std::unique_ptr<GNeuralDecomposition> hND(new GNeuralDecomposition());
	GNeuralDecomposition *nd = hND.get();
	while(args.next_is_flag())
	{
		if(args.if_pop("-regularization"))
			nd->setRegularization(args.pop_double());
		else if(args.if_pop("-learningRate"))
			nd->setLearningRate(args.pop_double());
		else if(args.if_pop("-linearUnits"))
			nd->setLinearUnits(args.pop_uint());
		else if(args.if_pop("-softplusUnits"))
			nd->setSoftplusUnits(args.pop_uint());
		else if(args.if_pop("-sigmoidUnits"))
			nd->setSigmoidUnits(args.pop_uint());
		else if(args.if_pop("-epochs"))
			nd->setEpochs(args.pop_uint());
		else if(args.if_pop("-features"))
			LoadData(args, hFeatures);
		else if(args.if_pop("-filterLogarithm"))
			nd->setFilterLogarithm(true);
		else
			throw Ex("Invalid option: ", args.peek());
	}

	if(hFeatures.get() == NULL)
	{
		// Generate features: equally-spaced points on [0,1)
		GMatrix *pFeatures = new GMatrix(pSeries->rows(), 1);
		for(size_t i = 0; i < pSeries->rows(); i++)
		{
			pFeatures->row(i)[0] = i / (double) pSeries->rows();
		}
		hFeatures.reset(pFeatures);
	}

	// Train
	GMatrix *pFeatures = hFeatures.get();
	nd->train(*pFeatures, *pSeries);

	// Output the trained model
	GDom doc;
	doc.setRoot(nd->serialize(&doc));
	doc.writeJson(cout);
}
// Transduces labels for the unlabeled rows (features2) from the labeled rows
// (features1/labels1) by running a max-flow/min-cut graph cut, once per
// non-zero label value of each label column. Rows on the "source" side of a
// cut receive that label value; rows never claimed by any cut keep 0.
// Returns a newly allocated label matrix for features2 (caller owns it).
// virtual
GMatrix* GGraphCutTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2)
{
	// Use k-NN to compute a distance metric with good scale factors for prediction
	GKNN knn;
	knn.setNeighborCount(m_neighborCount);
	//knn.setOptimizeScaleFactors(true);
	knn.train(features1, labels1);
	GRowDistanceScaled* pMetric = knn.metric();

	// Merge the features into one dataset and build a kd-tree.
	// (takeRow stores the original rows by pointer; GReleaseDataHolder makes
	// sure "both" does not try to delete rows it does not own.)
	GMatrix both(features1.relation().clone());
	GReleaseDataHolder hBoth(&both);
	both.reserve(features1.rows() + features2.rows());
	for(size_t i = 0; i < features1.rows(); i++)
		both.takeRow((double*)features1[i]);
	for(size_t i = 0; i < features2.rows(); i++)
		both.takeRow((double*)features2[i]);
	GRowDistanceScaled metric2;
	GKdTree neighborFinder(&both, m_neighborCount, &metric2, false);
	// Reuse the scale factors learned by the k-NN model
	GVec::copy(metric2.scaleFactors(), pMetric->scaleFactors(), features1.cols());

	// Transduce
	GMatrix* pOut = new GMatrix(labels1.relation().clone());
	Holder<GMatrix> hOut(pOut);
	pOut->newRows(features2.rows());
	pOut->setAll(0); // label value 0 is the default; cuts only assign values >= 1
	for(size_t lab = 0; lab < labels1.cols(); lab++)
	{
		// Use max-flow/min-cut graph-cut to separate out each label value
		int valueCount = (int)labels1.relation().valueCount(lab);
		for(int val = 1; val < valueCount; val++)
		{
			// Add neighborhood edges. Graph nodes: 0 = source, 1 = sink,
			// 2+i = row i of the merged dataset.
			GGraphCut gc(features1.rows() + features2.rows() + 2);
			for(size_t i = 0; i < both.rows(); i++)
			{
				neighborFinder.neighbors(m_pNeighbors, m_pDistances, i);
				for(size_t j = 0; j < m_neighborCount; j++)
				{
					if(m_pNeighbors[j] >= both.rows())
						continue; // skip "no neighbor" sentinels
					// Edge capacity is inversely proportional to distance (clamped away from 0)
					gc.addEdge(2 + i, 2 + m_pNeighbors[j], (float)(1.0 / std::max(sqrt(m_pDistances[j]), 1e-9))); // connect neighbors
				}
			}

			// Add source and sink edges: labeled rows are pinned to one side
			// with effectively infinite capacity.
			// NOTE(review): this reads labels1[i][0] rather than labels1[i][lab];
			// for multi-column labels that looks like it ignores the current
			// label column -- confirm intent.
			for(size_t i = 0; i < features1.rows(); i++)
			{
				if((int)labels1[i][0] == val)
					gc.addEdge(0, 2 + i, 1e12f); // connect to source
				else
					gc.addEdge(1, 2 + i, 1e12f); // connect to sink
			}

			// Cut
			gc.cut(0, 1);

			// Label the unlabeled rows: those on the source side get this value
			for(size_t i = 0; i < features2.rows(); i++)
			{
				if(gc.isSource(2 + features1.rows() + i))
					pOut->row(i)[lab] = (double)val;
			}
		}
	}
	return hOut.release();
}
// Trains a collaborative filter to impute (fill in) the missing values of a
// dataset, then prints the completed dataset to stdout. Nominal attributes are
// converted to categorical reals (and optionally normalized) before training,
// and the original schema is restored on output.
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	// Seed and normalization flags (parsed before the filename)
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data and the filter
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse params (flags that follow the filename)
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes (delete from highest index down so the
	// remaining indexes stay valid; the loop condition uses size_t wrap-around
	// to terminate after index 0)
	std::sort(ignore.begin(), ignore.end());
	for(size_t i = ignore.size() - 1; i < ignore.size(); i--)
		dataOrig.deleteColumns(ignore[i], 1);

	// Keep a copy of the original schema so it can be restored at the end
	GRelation* pOrigRel = dataOrig.relation().clone();
	std::unique_ptr<GRelation> hOrigRel(pOrigRel);
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values (unknown values are preserved so
	// they can be imputed below)
	GNominalToCat* pNtc = new GNominalToCat();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		GIncrementalTransformChainer* pChainer = new GIncrementalTransformChainer(new GNormalize(), pNtc);
		hChainer.reset(pChainer);
		pFilter = pChainer;
	}
	// NOTE(review): when -nonormalize is given, pNtc has no owner in sight here
	// -- confirm it is not leaked.
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	GMatrix* pData = pFilter->transformBatch(dataOrig);
	std::unique_ptr<GMatrix> hData(pData);

	// Convert to 3-column (row-index, col-index, value) form, listing only the
	// known values
	GMatrix* pMatrix = new GMatrix(0, 3);
	std::unique_ptr<GMatrix> hMatrix(pMatrix);
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = pMatrix->newRow();
				vec[0] = (double)i;
				vec[1] = (double)j;
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter
	pModel->train(*pMatrix);
	// NOTE(review): release() relinquishes ownership without deleting, so
	// pMatrix appears to be leaked here -- confirm whether reset() was intended.
	hMatrix.release();
	pMatrix = NULL;

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form
	GMatrix* pOut = pFilter->untransformBatch(*pData);
	pOut->setRelation(hOrigRel.release());
	pOut->print(cout);
}
// Projects the dataset onto its first nTargetDims principal components and
// prints the projected data. Optionally saves the eigenvalues, the component
// vectors, a round-trip reconstruction, and/or the serialized model, or loads
// a previously trained model instead of training a new one.
void principalComponentAnalysis(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	string eigenvalues;
	string components;
	string modelIn;
	string modelOut;
	bool aboutOrigin = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-roundtrip"))
			roundTrip = args.pop_string();
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else if(args.if_pop("-components"))
			components = args.pop_string();
		else if(args.if_pop("-aboutorigin"))
			aboutOrigin = true;
		else if(args.if_pop("-modelin"))
			modelIn = args.pop_string();
		else if(args.if_pop("-modelout"))
			modelOut = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data: either deserialize a previously trained model, or
	// train a fresh one on the loaded data
	GRand prng(seed);
	GPCA* pTransform = NULL;
	if(modelIn.length() > 0)
	{
		GDom doc;
		doc.loadJson(modelIn.c_str());
		GLearnerLoader ll(prng);
		pTransform = new GPCA(doc.root(), ll);
	}
	else
	{
		pTransform = new GPCA(nTargetDims, &prng);
		if(aboutOrigin)
			pTransform->aboutOrigin(); // do not subtract the centroid first
		if(eigenvalues.length() > 0)
			pTransform->computeEigVals();
		pTransform->train(*pData);
	}
	Holder<GPCA> hTransform(pTransform);
	GMatrix* pDataAfter = pTransform->transformBatch(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues
	// NOTE(review): when -modelin is combined with -eigenvalues, computeEigVals
	// is never called on the loaded model -- confirm eigVals() is valid then.
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = pTransform->eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// Save the components
	if(components.length() > 0)
		pTransform->components()->saveArff(components.c_str());

	// Do the round-trip: project each row back to the original space and save
	// the reconstruction (useful for judging how much information was lost)
	if(roundTrip.size() > 0)
	{
		GMatrix roundTripped(pData->rows(), pData->cols());
		for(size_t i = 0; i < pData->rows(); i++)
			pTransform->untransform(pDataAfter->row(i), roundTripped.row(i));
		roundTripped.saveArff(roundTrip.c_str());
	}

	// Save the trained model, if requested
	if(modelOut.length() > 0)
	{
		GDom doc;
		doc.setRoot(pTransform->serialize(&doc));
		doc.saveJson(modelOut.c_str());
	}

	pDataAfter->print(cout);
}
// Compares two columns of a dataset and prints basic statistics plus several
// significance tests (paired t-tests and the Wilcoxon signed-ranks test).
// Values closer together than -tol are treated as equal.
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";
		// Count how the paired values compare, treating differences within
		// the tolerance as ties
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v; // degrees of freedom
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Wilcoxon Signed Ranks Test";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";
		// NOTE(review): p_min already carries a 0.5 factor, and the "one-tailed"
		// line multiplies by 0.5 again (printing a quarter of wilcoxonPValue)
		// -- confirm which value wilcoxonPValue returns and whether the double
		// halving is intended.
		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less that 0.05 is considered to be significant.\n";
		/*
		double p_sum = GMath::wilcoxonPValue(num, w_sum);
		cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
		*/
	}
}
// Transduces labels for the unlabeled rows (features2) by agglomerative
// clustering: merges nearest-neighbor pairs in order of increasing distance,
// and whenever an unlabeled cluster touches a labeled point, propagates that
// label to every row in the cluster. Clusters are tracked as cyclic
// singly-linked "sibling" lists. Returns a newly allocated label matrix for
// features2 (caller owns it).
// virtual
GMatrix* GAgglomerativeTransducer::transduceInner(const GMatrix& features1, const GMatrix& labels1, const GMatrix& features2)
{
	// Init the metric
	if(!m_pMetric)
		setMetric(new GRowDistance(), true);
	m_pMetric->init(&features1.relation(), false);

	// Make a dataset with all features. takeRow stores the original rows by
	// pointer; GReleaseDataHolder keeps featuresAll from deleting rows it does
	// not own. Rows [0, features1.rows()) are labeled; the rest are not.
	GMatrix featuresAll(features1.relation().clone());
	featuresAll.reserve(features1.rows() + features2.rows());
	GReleaseDataHolder hFeaturesAll(&featuresAll);
	for(size_t i = 0; i < features1.rows(); i++)
		featuresAll.takeRow((double*)features1[i]);
	for(size_t i = 0; i < features2.rows(); i++)
		featuresAll.takeRow((double*)features2[i]);

	// Find enough neighbors to form a connected graph, growing the neighbor
	// count by ~1.5x until the neighbor graph is connected.
	// NOTE(review): pNF from a failed iteration is overwritten without being
	// deleted, and the final pNF is never freed -- looks like a leak; confirm.
	GNeighborGraph* pNF = NULL;
	size_t neighbors = 6;
	while(true)
	{
		GKdTree* pKdTree = new GKdTree(&featuresAll, neighbors, m_pMetric, false);
		pNF = new GNeighborGraph(pKdTree, true);
		pNF->fillCache();
		if(pNF->isConnected())
			break;
		if(neighbors + 1 >= featuresAll.rows())
		{
			delete(pNF);
			throw Ex("internal problem--a graph with so many neighbors must be connected");
		}
		neighbors = std::min((neighbors * 3) / 2, featuresAll.rows() - 1);
	}

	// Sort all the neighbors by their distances. Entries whose neighbor index
	// is out of range ("no neighbor" sentinels) are skipped, so only the prefix
	// [begin, it) of distNeighs is meaningful.
	// NOTE(review): `index` is decremented but never read -- appears to be dead.
	size_t count = featuresAll.rows() * neighbors;
	vector< std::pair<double,size_t> > distNeighs;
	distNeighs.resize(count);
	double* pDistances = pNF->squaredDistanceTable();
	size_t* pRows = pNF->cache();
	size_t index = 0;
	vector< std::pair<double,size_t> >::iterator it = distNeighs.begin();
	for(size_t i = 0; i < count; i++)
	{
		if(*pRows < featuresAll.rows())
		{
			it->first = *pDistances;
			it->second = i; // flat index: row = i / neighbors, neighbor = cache[i]
			it++;
		}
		else
			index--;
		pRows++;
		pDistances++;
	}
	std::sort(distNeighs.begin(), it);

	// Transduce
	GMatrix* pOut = new GMatrix(labels1.relation().clone());
	Holder<GMatrix> hOut(pOut);
	pOut->newRows(features2.rows());
	pOut->setAll(-1); // -1 marks "not yet labeled"
	size_t* pSiblings = new size_t[featuresAll.rows()]; // a cyclical linked list of each row in the cluster
	ArrayHolder<size_t> hSiblings(pSiblings);
	for(size_t lab = 0; lab < labels1.cols(); lab++)
	{
		// Assign each row to its own cluster
		GIndexVec::makeIndexVec(pSiblings, featuresAll.rows()); // init such that each row is in a cluster of 1
		size_t missingLabels = features2.rows();

		// Merge until we have the desired number of clusters
		pRows = pNF->cache();
		for(vector< std::pair<double,size_t> >::iterator dn = distNeighs.begin(); dn != it; dn++)
		{
			// Get the next two closest points
			size_t a = dn->second / neighbors;
			size_t b = pRows[dn->second];
			GAssert(a != b && a < featuresAll.rows() && b < featuresAll.rows());
			// A row's current label: from labels1 if it is a labeled row,
			// otherwise from the output matrix (-1 while unassigned)
			int labelA = (a < features1.rows() ? (int)labels1[a][lab] : (int)pOut->row(a - features1.rows())[lab]);
			int labelB = (b < features1.rows() ? (int)labels1[b][lab] : (int)pOut->row(b - features1.rows())[lab]);

			// Merge the clusters
			if(labelA >= 0 && labelB >= 0)
				continue; // Both points are already labeled, so there is no point in merging their clusters
			if(labelA < 0 && labelB >= 0) // Make sure that if one of them has a valid label, it is point a
			{
				std::swap(a, b);
				std::swap(labelA, labelB);
			}
			if(labelA >= 0)
			{
				for(size_t i = pSiblings[b]; true; i = pSiblings[i]) // Label every row in cluster b
				{
					GAssert(i >= features1.rows());
					GAssert(pOut->row(i - features1.rows())[lab] == (double)-1);
					pOut->row(i - features1.rows())[lab] = labelA;
					missingLabels--;
					if(i == b)
						break;
				}
				if(missingLabels <= 0)
					break; // every unlabeled row has been assigned; done with this column
			}
			std::swap(pSiblings[a], pSiblings[b]); // This line joins the cyclical linked lists into one big cycle
		}
	}
	return hOut.release();
}
// Reduces the dataset to nTargetDims dimensions with GNeuroPCA (a neural
// network based nonlinear PCA) and prints the projected data. Optionally saves
// the eigenvalues, clamps the bias, or uses a linear (identity) activation.
void neuroPCA(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int nTargetDims = args.pop_uint();

	// Parse options
	string roundTrip;
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool trainBias = true;
	bool linear = false;
	string eigenvalues = "";
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-clampbias"))
			trainBias = false;
		else if(args.if_pop("-linear"))
			linear = true;
		else if(args.if_pop("-eigenvalues"))
			eigenvalues = args.pop_string();
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Transform the data
	GRand prng(seed);
	GNeuroPCA transform(nTargetDims, &prng);
	if(!trainBias)
		transform.clampBias();
	if(linear)
		transform.setActivation(new GActivationIdentity());
	if(eigenvalues.length() > 0)
		transform.computeEigVals();
	GMatrix* pDataAfter = transform.doit(*pData);
	Holder<GMatrix> hDataAfter(pDataAfter);

	// Save the eigenvalues as a one-column ARFF file
	if(eigenvalues.length() > 0)
	{
		GArffRelation* pRelation = new GArffRelation();
		pRelation->addAttribute("eigenvalues", 0, NULL);
		sp_relation pRel = pRelation;
		GMatrix dataEigenvalues(pRel);
		dataEigenvalues.newRows(nTargetDims);
		double* pEigVals = transform.eigVals();
		for(int i = 0; i < nTargetDims; i++)
			dataEigenvalues[i][0] = pEigVals[i];
		dataEigenvalues.saveArff(eigenvalues.c_str());
	}

	// In linear mode, people usually expect normalized eigenvectors, so let's normalize them now
	// NOTE(review): the weight rows themselves are not rescaled here; only the
	// projected outputs are multiplied by each component's magnitude (taken
	// from weight row i+1, presumably skipping a bias row) -- confirm this
	// yields the intended normalized-eigenvector convention.
	if(linear)
	{
		GMatrix* pWeights = transform.weights();
		GAssert(pWeights->cols() == pData->cols());
		for(int i = 0; i < nTargetDims; i++)
		{
			double scal = sqrt(GVec::squaredMagnitude(pWeights->row(i + 1), pWeights->cols()));
			for(size_t j = 0; j < pDataAfter->rows(); j++)
				pDataAfter->row(j)[i] *= scal;
		}
	}

	pDataAfter->print(cout);
}