/// Generates precision-recall data for a collaborative filter and prints it
/// to stdout.
///
/// Arguments are consumed from args in this order:
///   [-seed <uint>] [-ideal] <dataset> <algorithm description...>
///
/// When -seed is not given, the seed is derived from the process id and the
/// current time. Throws Ex on an unrecognized flag, a missing dataset, or
/// arguments left over after the algorithm has been instantiated.
void GRecommenderLib::precisionRecall(GArgReader& args)
{
	// Defaults, which the leading flags may override
	unsigned int rngSeed = getpid() * (unsigned int)time(NULL);
	bool useIdeal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			rngSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-ideal"))
		{
			useIdeal = true;
			continue;
		}
		throw Ex("Invalid option: ", args.peek());
	}

	// The dataset must come next
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix data;
	loadData(data, args.pop_string());

	// Whatever remains describes the recommender; nothing may be left over afterwards
	std::unique_ptr<GCollaborativeFilter> hModel(InstantiateAlgorithm(args));
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	hModel->rand().setSeed(rngSeed);

	// Compute the precision-recall table and print it
	std::unique_ptr<GMatrix> hResults(hModel->precisionRecall(data, useIdeal));
	hResults->deleteColumns(2, 1); // the false-positive rate column is not needed
	hResults->print(cout);
}
/// Fills in the missing values of an ARFF dataset with predictions from a
/// collaborative filter, and prints the completed dataset to stdout.
///
/// Arguments are consumed from args in this order:
///   [-seed <uint>] [-nonormalize] <dataset.arff> [-ignore <attribute list>] <algorithm description...>
///
/// When -seed is not given, the seed is derived from the process id and the
/// current time. Unless -nonormalize is given, a GNormalize transform is
/// chained with the GNominalToCat transform before training.
///
/// Throws Ex on an unrecognized flag, a missing dataset, or arguments left
/// over after the algorithm has been instantiated.
void GRecommenderLib::fillMissingValues(GArgReader& args)
{
	// Parse the options that precede the dataset
	unsigned int seed = getpid() * (unsigned int)time(NULL);
	bool normalize = true;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
			seed = args.pop_uint();
		else if(args.if_pop("-nonormalize"))
			normalize = false;
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Load the data (same missing-dataset check as precisionRecall)
	if(args.size() < 1)
		throw Ex("No dataset specified.");
	GMatrix dataOrig;
	dataOrig.loadArff(args.pop_string());

	// Parse the options that follow the dataset
	vector<size_t> ignore;
	while(args.next_is_flag())
	{
		if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, dataOrig.cols());
		else
			throw Ex("Invalid option: ", args.peek());
	}

	// Throw out the ignored attributes. Deleting from the highest index down
	// keeps the remaining indices valid. Duplicates are collapsed so that an
	// attribute listed twice does not cause an unrelated column to be deleted.
	std::sort(ignore.begin(), ignore.end());
	ignore.erase(std::unique(ignore.begin(), ignore.end()), ignore.end());
	for(vector<size_t>::reverse_iterator it = ignore.rbegin(); it != ignore.rend(); ++it)
		dataOrig.deleteColumns(*it, 1);

	// Remember the relation so it can be attached to the output matrix
	std::unique_ptr<GRelation> hOrigRel(dataOrig.relation().clone());

	// Instantiate the recommender; nothing may be left over afterwards
	GCollaborativeFilter* pModel = InstantiateAlgorithm(args);
	std::unique_ptr<GCollaborativeFilter> hModel(pModel);
	if(args.size() > 0)
		throw Ex("Superfluous argument: ", args.peek());
	pModel->rand().setSeed(seed);

	// Convert to all normalized real values.
	// hNtc owns the GNominalToCat unless it is handed to the chainer below, so
	// it is no longer leaked when -nonormalize is given.
	std::unique_ptr<GNominalToCat> hNtc(new GNominalToCat());
	GNominalToCat* pNtc = hNtc.get();
	GIncrementalTransform* pFilter = pNtc;
	std::unique_ptr<GIncrementalTransformChainer> hChainer;
	if(normalize)
	{
		hChainer.reset(new GIncrementalTransformChainer(new GNormalize(), pNtc));
		// The chainer is given both transforms and is relied on to delete them
		// (the GNormalize has no other owner), so give up ownership of pNtc here.
		hNtc.release();
		pFilter = hChainer.get();
	}
	pNtc->preserveUnknowns();
	pFilter->train(dataOrig);
	std::unique_ptr<GMatrix> hData(pFilter->transformBatch(dataOrig));
	GMatrix* pData = hData.get();

	// Convert to 3-column form: (row index, column index, known value)
	std::unique_ptr<GMatrix> hMatrix(new GMatrix(0, 3));
	size_t dims = pData->cols();
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] != UNKNOWN_REAL_VALUE)
			{
				GVec& vec = hMatrix->newRow();
				vec[0] = static_cast<double>(i);
				vec[1] = static_cast<double>(j);
				vec[2] = row[j];
			}
		}
	}

	// Train the collaborative filter.
	// train() receives the matrix by reference, so ownership stays with
	// hMatrix. (Calling release() here, as before, abandoned the matrix without
	// deleting it.) It is kept alive until this function returns in case the
	// model still refers to it while predicting.
	pModel->train(*hMatrix);

	// Predict values for missing elements
	for(size_t i = 0; i < pData->rows(); i++)
	{
		GVec& row = pData->row(i);
		for(size_t j = 0; j < dims; j++)
		{
			if(row[j] == UNKNOWN_REAL_VALUE)
				row[j] = pModel->predict(i, j);
			GAssert(row[j] != UNKNOWN_REAL_VALUE);
		}
	}

	// Convert the data back to its original form. The result is held so that
	// it is freed, just like the result of transformBatch above.
	std::unique_ptr<GMatrix> hOut(pFilter->untransformBatch(*pData));
	hOut->setRelation(hOrigRel.release()); // the relation is handed over to the output matrix
	hOut->print(cout);
}
///Return a pointer to newly allocated data read from the command line ///represented by args. /// ///The returned matrix is allocated by new and it is the caller's ///responsibility to deallocate it. The suggested manner is to use a ///Holder<GMatrix*> /// ///In the returned matrix, all of the attributes designated as labels ///have been moved to the end and ignored attributes have been ///removed. The original indices of all the attributes are returned in ///originalIndices. /// ///\param args the command-line arguments /// ///\param pLabelDims (out parameter) the index of the first attribute ///which is designated a label. /// ///\param originalIndices the vector in which to place the original ///indices. originalIndices[i] is the index in the original data file ///of the attribute currently at index i. void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims, std::vector<size_t>& originalIndices) { // Load the dataset by extension const char* szFilename = args.pop_string(); PathData pd; GFile::parsePath(szFilename, &pd); if(_stricmp(szFilename + pd.extStart, ".arff") == 0) data.loadArff(szFilename); else if(_stricmp(szFilename + pd.extStart, ".csv") == 0) { GCSVParser parser; parser.parse(data, szFilename); cerr << "\nParsing Report:\n"; for(size_t i = 0; i < data.cols(); i++) cerr << to_str(i) << ") " << parser.report(i) << "\n"; } else if(_stricmp(szFilename + pd.extStart, ".dat") == 0) { GCSVParser parser; parser.setSeparator('\0'); parser.parse(data, szFilename); cerr << "\nParsing Report:\n"; for(size_t i = 0; i < data.cols(); i++) cerr << to_str(i) << ") " << parser.report(i) << "\n"; } else throw Ex("Unsupported file format: ", szFilename + pd.extStart); //Make the initial list of original indices originalIndices.resize(data.cols()); for(std::size_t i = 0; i < originalIndices.size(); ++i){ originalIndices.at(i) = i; } // Parse params vector<size_t> ignore; vector<size_t> labels; while(args.next_is_flag()) { if(args.if_pop("-labels")) 
parseAttributeList(labels, args, data.cols()); else if(args.if_pop("-ignore")) parseAttributeList(ignore, args, data.cols()); else break; } // Throw out the ignored attributes std::sort(ignore.begin(), ignore.end()); for(size_t i = ignore.size() - 1; i < ignore.size(); i--) { data.deleteColumns(ignore[i], 1); originalIndices.erase(originalIndices.begin()+ignore[i]); for(size_t j = 0; j < labels.size(); j++) { if(labels[j] >= ignore[i]) { if(labels[j] == ignore[i]) throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label"); labels[j]--; } } } // Swap label columns to the end pLabelDims = std::max((size_t)1, labels.size()); for(size_t i = 0; i < labels.size(); i++) { size_t src = labels[i]; size_t dst = data.cols() - pLabelDims + i; if(src != dst) { data.swapColumns(src, dst); std::swap(originalIndices.at(src), originalIndices.at(dst)); for(size_t j = i + 1; j < labels.size(); j++) { if(labels[j] == dst) { labels[j] = src; break; } } } } }