// Loads a dataset, replaces the values of continuous attributes in a column
// range with bucket indexes, and prints the resulting dataset to cout.
//
// args: the first argument is the filename handed to loadData. Options:
//   -buckets [n]       Number of buckets to use. Defaults to
//                      max(2, floor(sqrt(rows + 0.5))).
//   -colrange [a] [b]  Inclusive range of column indexes to process.
//                      Defaults to every column.
//
// Calls ThrowError for an unrecognized option, a bucket count of zero, or a
// column range that is reversed or extends past the last column.
void Discretize(GArgReader& args)
{
	// Load the file
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData); // presumably frees pData when it leaves scope

	// Parse Options
	size_t nFirst = 0;
	// Wraps around when the relation has no columns; the range check below
	// rejects that case because nLast is then >= the column count.
	size_t nLast = pData->relation()->size() - 1;
	size_t nBuckets = std::max(2, (int)floor(sqrt((double)pData->rows() + 0.5)));
	while(args.size() > 0)
	{
		if(args.if_pop("-buckets"))
			nBuckets = args.pop_uint();
		else if(args.if_pop("-colrange"))
		{
			nFirst = args.pop_uint();
			nLast = args.pop_uint();
		}
		else
			ThrowError("Invalid option: ", args.peek());
	}
	// With zero buckets "nBuckets - 1" would wrap around and the attribute
	// would be given a value count of 0, so refuse it up front.
	if(nBuckets < 1)
		ThrowError("The number of buckets must be at least 1");
	// nFirst is unsigned, so only the upper bound and the ordering need checking.
	if(nLast >= pData->relation()->size() || nLast < nFirst)
		ThrowError("column index out of range");

	// Discretize the continuous attributes in the specified range
	for(size_t i = nFirst; i <= nLast; i++)
	{
		// Attributes that already report a nonzero value count are left alone
		if(pData->relation()->valueCount(i) != 0)
			continue;
		double min, range;
		pData->minAndRange(i, &min, &range);
		for(size_t j = 0; j < pData->rows(); j++)
		{
			double* pPat = pData->row(j);

			// Clamp in the floating-point domain BEFORE converting to size_t.
			// Converting a negative, NaN or too-large double to an unsigned
			// integer is undefined behavior, and clamping afterwards (as
			// std::max((size_t)0, ...) tried to) cannot undo it.
			double scaled = ((pPat[i] - min) * nBuckets) / range;
			size_t bucket;
			if(!(scaled >= 0.0))
				bucket = 0; // negative values and NaN (e.g. 0/0 when range is 0)
			else if(scaled >= (double)nBuckets)
				bucket = nBuckets - 1; // the column maximum, +infinity, or anything above
			else
				bucket = (size_t)floor(scaled);
			pPat[i] = (double)bucket;
		}
		// NOTE(review): this cast assumes the relation really is a GArffRelation -- confirm.
		((GArffRelation*)pData->relation().get())->setAttrValueCount(i, nBuckets);
	}

	// Print results
	pData->print(cout);
}
// Loads a sparse matrix from a file, choosing the loader by file extension
// (compared case-insensitively):
//   .arff    A dense ARFF file with exactly 3 columns, read as
//            (row index, column index, value) triples.
//   .sparse  A JSON document that is handed to the GSparseMatrix constructor.
//
// Returns a newly allocated GSparseMatrix; the caller takes ownership of it.
// Calls ThrowError if the extension is not supported, if the ARFF file does not
// have 3 columns, or if its index columns fail the sanity checks below.
GSparseMatrix* loadSparseData(const char* szFilename)
{
	// Load the dataset by extension
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	if(_stricmp(szFilename + pd.extStart, ".arff") == 0)
	{
		// Convert a 3-column dense ARFF file to a sparse matrix
		GMatrix* pData = GMatrix::loadArff(szFilename);
		// Hold the dense matrix so it is not leaked, either on the ThrowError
		// paths below or once the sparse copy has been built.
		Holder<GMatrix> hData(pData);
		if(pData->cols() != 3)
			ThrowError("Expected 3 columns: 0) user or row-index, 1) item or col-index, 2) value or rating");

		// Sanity-check the two index columns: the minimum must lie in
		// [0, 1e10] and the range in [2, 1e10].
		double m0, r0, m1, r1;
		pData->minAndRange(0, &m0, &r0);
		pData->minAndRange(1, &m1, &r1);
		if(m0 < 0 || m0 > 1e10 || r0 < 2 || r0 > 1e10)
			ThrowError("Invalid row indexes");
		if(m1 < 0 || m1 > 1e10 || r1 < 2 || r1 > 1e10)
			ThrowError("Invalid col indexes");

		// min + range is the largest index in a column, so each dimension is
		// that plus one. Cells that are never set hold UNKNOWN_REAL_VALUE.
		GSparseMatrix* pMatrix = new GSparseMatrix(size_t(m0 + r0) + 1, size_t(m1 + r1) + 1, UNKNOWN_REAL_VALUE);
		Holder<GSparseMatrix> hMatrix(pMatrix);
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			pMatrix->set(size_t(pRow[0]), size_t(pRow[1]), pRow[2]);
		}
		return hMatrix.release();
	}
	else if(_stricmp(szFilename + pd.extStart, ".sparse") == 0)
	{
		GDom doc;
		doc.loadJson(szFilename);
		return new GSparseMatrix(doc.root());
	}
	ThrowError("Unsupported file format: ", szFilename + pd.extStart);
	return NULL;
}